From e42e28c709c08347fdb8f05ebc94e39f6532797c Mon Sep 17 00:00:00 2001 From: Evgenii Shatokhin Date: Tue, 26 Jul 2016 11:24:24 +0300 Subject: [PATCH] Keep the patches in the kernel project instead of a separate one Earlier, the patches were kept in the respective branches of a separate project, https://abf.io/soft/kernel-patches-and-configs. And before that - in the custom tarballs. Now all the patches are kept here along with the spec file and are applied the default way rather than by separate scripts. This should make the maintenance of the patches as well as the experiments with the new ones a lot easier. The previous scheme seemed to offer a bit more flexibility (different patch sets for different cases) at the cost of maintenance. But as it turned out, that flexibility was not worth it and was rarely used, at most. --- ...onfig-build-bits-for-BFQ-v7r11-4.5.0.patch | 103 + ...ce-the-BFQ-v7r11-I-O-sched-for-4.5.0.patch | 7097 +++ ...rly-Queue-Merge-EQM-to-BFQ-v7r11-for.patch | 1101 + ...over-ide-drivers-when-both-are-built.patch | 34 + audit-make-it-less-verbose.patch | 22 + block-floppy-disable-pnp-modalias.patch | 21 + drm-cirrus-Use-16bpp-as-default.patch | 49 + fs-aufs4.patch | 35794 ++++++++++++++++ hp-wmi-rfkill-fix.patch | 31 + include-kbuild-export-pci_ids.patch | 19 + inotify-increase-max-user-watches.patch | 20 + kernel.spec | 65 +- perf-python-ext-link-with-dl.patch | 22 + perf-xmlto-skip-validation.patch | 13 + ...inimum-vmalloc-area-by-64MB-to-192MB.patch | 28 + 15 files changed, 44401 insertions(+), 18 deletions(-) create mode 100644 0001-block-cgroups-kconfig-build-bits-for-BFQ-v7r11-4.5.0.patch create mode 100644 0002-block-introduce-the-BFQ-v7r11-I-O-sched-for-4.5.0.patch create mode 100644 0003-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r11-for.patch create mode 100644 ata-prefer-ata-drivers-over-ide-drivers-when-both-are-built.patch create mode 100644 audit-make-it-less-verbose.patch create mode 100644 block-floppy-disable-pnp-modalias.patch create mode 100644 drm-cirrus-Use-16bpp-as-default.patch create mode 100644 fs-aufs4.patch create mode 100644 hp-wmi-rfkill-fix.patch create mode 100644 include-kbuild-export-pci_ids.patch create mode 100644 inotify-increase-max-user-watches.patch create mode 100644 perf-python-ext-link-with-dl.patch create mode 100644 perf-xmlto-skip-validation.patch create mode 100644 x86-increase-default-minimum-vmalloc-area-by-64MB-to-192MB.patch diff --git a/0001-block-cgroups-kconfig-build-bits-for-BFQ-v7r11-4.5.0.patch b/0001-block-cgroups-kconfig-build-bits-for-BFQ-v7r11-4.5.0.patch new file mode 100644 index 0000000..17ee130 --- /dev/null +++ b/0001-block-cgroups-kconfig-build-bits-for-BFQ-v7r11-4.5.0.patch @@ -0,0 +1,103 @@ +From f54f3003586bf00ba0ee5974a92b732477b834e3 Mon Sep 17 00:00:00 2001 +From: Paolo Valente +Date: Tue, 7 Apr 2015 13:39:12 +0200 +Subject: [PATCH 1/3] block: cgroups, kconfig, build bits for BFQ-v7r11-4.5.0 + +Update Kconfig.iosched and do the related Makefile changes to include +kernel configuration options for BFQ. Also increase the number of +policies supported by the blkio controller so that BFQ can add its +own. + +Signed-off-by: Paolo Valente +Signed-off-by: Arianna Avanzini +--- + block/Kconfig.iosched | 32 ++++++++++++++++++++++++++++++++ + block/Makefile | 1 + + include/linux/blkdev.h | 2 +- + 3 files changed, 34 insertions(+), 1 deletion(-) + +diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched +index 421bef9..0ee5f0f 100644 +--- a/block/Kconfig.iosched ++++ b/block/Kconfig.iosched +@@ -39,6 +39,27 @@ config CFQ_GROUP_IOSCHED + ---help--- + Enable group IO scheduling in CFQ. + ++config IOSCHED_BFQ ++ tristate "BFQ I/O scheduler" ++ default n ++ ---help--- ++ The BFQ I/O scheduler tries to distribute bandwidth among ++ all processes according to their weights. ++ It aims at distributing the bandwidth as desired, independently of ++ the disk parameters and with any workload. It also tries to ++ guarantee low latency to interactive and soft real-time ++ applications. If compiled built-in (saying Y here), BFQ can ++ be configured to support hierarchical scheduling. ++ ++config CGROUP_BFQIO ++ bool "BFQ hierarchical scheduling support" ++ depends on CGROUPS && IOSCHED_BFQ=y ++ default n ++ ---help--- ++ Enable hierarchical scheduling in BFQ, using the cgroups ++ filesystem interface. The name of the subsystem will be ++ bfqio. ++ + choice + prompt "Default I/O scheduler" + default DEFAULT_CFQ +@@ -52,6 +73,16 @@ choice + config DEFAULT_CFQ + bool "CFQ" if IOSCHED_CFQ=y + ++ config DEFAULT_BFQ ++ bool "BFQ" if IOSCHED_BFQ=y ++ help ++ Selects BFQ as the default I/O scheduler which will be ++ used by default for all block devices. ++ The BFQ I/O scheduler aims at distributing the bandwidth ++ as desired, independently of the disk parameters and with ++ any workload. It also tries to guarantee low latency to ++ interactive and soft real-time applications. ++ + config DEFAULT_NOOP + bool "No-op" + +@@ -61,6 +92,7 @@ config DEFAULT_IOSCHED + string + default "deadline" if DEFAULT_DEADLINE + default "cfq" if DEFAULT_CFQ ++ default "bfq" if DEFAULT_BFQ + default "noop" if DEFAULT_NOOP + + endmenu +diff --git a/block/Makefile b/block/Makefile +index 00ecc97..1ed86d5 100644 +--- a/block/Makefile ++++ b/block/Makefile +@@ -18,6 +18,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o + obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o + obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o + obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o ++obj-$(CONFIG_IOSCHED_BFQ) += bfq-iosched.o + + obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o + obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o +diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h +index c70e358..ae43492 100644 +--- a/include/linux/blkdev.h ++++ b/include/linux/blkdev.h +@@ -44,7 +44,7 @@ struct pr_ops; + * Maximum number of blkcg policies allowed to be registered concurrently. + * Defined here to simplify include dependency. + */ +-#define BLKCG_MAX_POLS 2 ++#define BLKCG_MAX_POLS 3 + + struct request; + typedef void (rq_end_io_fn)(struct request *, int); +-- +1.9.1 + diff --git a/0002-block-introduce-the-BFQ-v7r11-I-O-sched-for-4.5.0.patch b/0002-block-introduce-the-BFQ-v7r11-I-O-sched-for-4.5.0.patch new file mode 100644 index 0000000..6095fa9 --- /dev/null +++ b/0002-block-introduce-the-BFQ-v7r11-I-O-sched-for-4.5.0.patch @@ -0,0 +1,7097 @@ +From 03d30cc06a5436c05ee338bd21903802181bafe9 Mon Sep 17 00:00:00 2001 +From: Paolo Valente +Date: Thu, 9 May 2013 19:10:02 +0200 +Subject: [PATCH 2/3] block: introduce the BFQ-v7r11 I/O sched for 4.5.0 + +The general structure is borrowed from CFQ, as much of the code for +handling I/O contexts. Over time, several useful features have been +ported from CFQ as well (details in the changelog in README.BFQ). A +(bfq_)queue is associated to each task doing I/O on a device, and each +time a scheduling decision has to be made a queue is selected and served +until it expires. + + - Slices are given in the service domain: tasks are assigned + budgets, measured in number of sectors. Once got the disk, a task + must however consume its assigned budget within a configurable + maximum time (by default, the maximum possible value of the + budgets is automatically computed to comply with this timeout). + This allows the desired latency vs "throughput boosting" tradeoff + to be set. + + - Budgets are scheduled according to a variant of WF2Q+, implemented + using an augmented rb-tree to take eligibility into account while + preserving an O(log N) overall complexity. + + - A low-latency tunable is provided; if enabled, both interactive + and soft real-time applications are guaranteed a very low latency. + + - Latency guarantees are preserved also in the presence of NCQ. + + - Also with flash-based devices, a high throughput is achieved + while still preserving latency guarantees. + + - BFQ features Early Queue Merge (EQM), a sort of fusion of the + cooperating-queue-merging and the preemption mechanisms present + in CFQ. EQM is in fact a unified mechanism that tries to get a + sequential read pattern, and hence a high throughput, with any + set of processes performing interleaved I/O over a contiguous + sequence of sectors. + + - BFQ supports full hierarchical scheduling, exporting a cgroups + interface. Since each node has a full scheduler, each group can + be assigned its own weight. + + - If the cgroups interface is not used, only I/O priorities can be + assigned to processes, with ioprio values mapped to weights + with the relation weight = IOPRIO_BE_NR - ioprio. + + - ioprio classes are served in strict priority order, i.e., lower + priority queues are not served as long as there are higher + priority queues. Among queues in the same class the bandwidth is + distributed in proportion to the weight of each queue. A very + thin extra bandwidth is however guaranteed to the Idle class, to + prevent it from starving. + +Signed-off-by: Paolo Valente +Signed-off-by: Arianna Avanzini +--- + block/Kconfig.iosched | 6 +- + block/bfq-cgroup.c | 1182 ++++++++++++++++ + block/bfq-ioc.c | 36 + + block/bfq-iosched.c | 3754 +++++++++++++++++++++++++++++++++++++++++++++++++ + block/bfq-sched.c | 1200 ++++++++++++++++ + block/bfq.h | 801 +++++++++++ + 6 files changed, 6975 insertions(+), 4 deletions(-) + create mode 100644 block/bfq-cgroup.c + create mode 100644 block/bfq-ioc.c + create mode 100644 block/bfq-iosched.c + create mode 100644 block/bfq-sched.c + create mode 100644 block/bfq.h + +diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched +index 0ee5f0f..f78cd1a 100644 +--- a/block/Kconfig.iosched ++++ b/block/Kconfig.iosched +@@ -51,14 +51,12 @@ config IOSCHED_BFQ + applications. If compiled built-in (saying Y here), BFQ can + be configured to support hierarchical scheduling. + +-config CGROUP_BFQIO ++config BFQ_GROUP_IOSCHED + bool "BFQ hierarchical scheduling support" + depends on CGROUPS && IOSCHED_BFQ=y + default n + ---help--- +- Enable hierarchical scheduling in BFQ, using the cgroups +- filesystem interface. The name of the subsystem will be +- bfqio. ++ Enable hierarchical scheduling in BFQ, using the blkio controller. + + choice + prompt "Default I/O scheduler" +diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c +new file mode 100644 +index 0000000..8610cd6 +--- /dev/null ++++ b/block/bfq-cgroup.c +@@ -0,0 +1,1182 @@ ++/* ++ * BFQ: CGROUPS support. ++ * ++ * Based on ideas and code from CFQ: ++ * Copyright (C) 2003 Jens Axboe ++ * ++ * Copyright (C) 2008 Fabio Checconi ++ * Paolo Valente ++ * ++ * Copyright (C) 2010 Paolo Valente ++ * ++ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ ++ * file. ++ */ ++ ++#ifdef CONFIG_BFQ_GROUP_IOSCHED ++ ++/* bfqg stats flags */ ++enum bfqg_stats_flags { ++ BFQG_stats_waiting = 0, ++ BFQG_stats_idling, ++ BFQG_stats_empty, ++}; ++ ++#define BFQG_FLAG_FNS(name) \ ++static void bfqg_stats_mark_##name(struct bfqg_stats *stats) \ ++{ \ ++ stats->flags |= (1 << BFQG_stats_##name); \ ++} \ ++static void bfqg_stats_clear_##name(struct bfqg_stats *stats) \ ++{ \ ++ stats->flags &= ~(1 << BFQG_stats_##name); \ ++} \ ++static int bfqg_stats_##name(struct bfqg_stats *stats) \ ++{ \ ++ return (stats->flags & (1 << BFQG_stats_##name)) != 0; \ ++} \ ++ ++BFQG_FLAG_FNS(waiting) ++BFQG_FLAG_FNS(idling) ++BFQG_FLAG_FNS(empty) ++#undef BFQG_FLAG_FNS ++ ++/* This should be called with the queue_lock held. */ ++static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats) ++{ ++ unsigned long long now; ++ ++ if (!bfqg_stats_waiting(stats)) ++ return; ++ ++ now = sched_clock(); ++ if (time_after64(now, stats->start_group_wait_time)) ++ blkg_stat_add(&stats->group_wait_time, ++ now - stats->start_group_wait_time); ++ bfqg_stats_clear_waiting(stats); ++} ++ ++/* This should be called with the queue_lock held. */ ++static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, ++ struct bfq_group *curr_bfqg) ++{ ++ struct bfqg_stats *stats = &bfqg->stats; ++ ++ if (bfqg_stats_waiting(stats)) ++ return; ++ if (bfqg == curr_bfqg) ++ return; ++ stats->start_group_wait_time = sched_clock(); ++ bfqg_stats_mark_waiting(stats); ++} ++ ++/* This should be called with the queue_lock held. */ ++static void bfqg_stats_end_empty_time(struct bfqg_stats *stats) ++{ ++ unsigned long long now; ++ ++ if (!bfqg_stats_empty(stats)) ++ return; ++ ++ now = sched_clock(); ++ if (time_after64(now, stats->start_empty_time)) ++ blkg_stat_add(&stats->empty_time, ++ now - stats->start_empty_time); ++ bfqg_stats_clear_empty(stats); ++} ++ ++static void bfqg_stats_update_dequeue(struct bfq_group *bfqg) ++{ ++ blkg_stat_add(&bfqg->stats.dequeue, 1); ++} ++ ++static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) ++{ ++ struct bfqg_stats *stats = &bfqg->stats; ++ ++ if (blkg_rwstat_total(&stats->queued)) ++ return; ++ ++ /* ++ * group is already marked empty. This can happen if bfqq got new ++ * request in parent group and moved to this group while being added ++ * to service tree. Just ignore the event and move on. ++ */ ++ if (bfqg_stats_empty(stats)) ++ return; ++ ++ stats->start_empty_time = sched_clock(); ++ bfqg_stats_mark_empty(stats); ++} ++ ++static void bfqg_stats_update_idle_time(struct bfq_group *bfqg) ++{ ++ struct bfqg_stats *stats = &bfqg->stats; ++ ++ if (bfqg_stats_idling(stats)) { ++ unsigned long long now = sched_clock(); ++ ++ if (time_after64(now, stats->start_idle_time)) ++ blkg_stat_add(&stats->idle_time, ++ now - stats->start_idle_time); ++ bfqg_stats_clear_idling(stats); ++ } ++} ++ ++static void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) ++{ ++ struct bfqg_stats *stats = &bfqg->stats; ++ ++ stats->start_idle_time = sched_clock(); ++ bfqg_stats_mark_idling(stats); ++} ++ ++static void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) ++{ ++ struct bfqg_stats *stats = &bfqg->stats; ++ ++ blkg_stat_add(&stats->avg_queue_size_sum, ++ blkg_rwstat_total(&stats->queued)); ++ blkg_stat_add(&stats->avg_queue_size_samples, 1); ++ bfqg_stats_update_group_wait_time(stats); ++} ++ ++static struct blkcg_policy blkcg_policy_bfq; ++ ++/* ++ * blk-cgroup policy-related handlers ++ * The following functions help in converting between blk-cgroup ++ * internal structures and BFQ-specific structures. ++ */ ++ ++static struct bfq_group *pd_to_bfqg(struct blkg_policy_data *pd) ++{ ++ return pd ? container_of(pd, struct bfq_group, pd) : NULL; ++} ++ ++static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg) ++{ ++ return pd_to_blkg(&bfqg->pd); ++} ++ ++static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg) ++{ ++ struct blkg_policy_data *pd = blkg_to_pd(blkg, &blkcg_policy_bfq); ++ BUG_ON(!pd); ++ return pd_to_bfqg(pd); ++} ++ ++/* ++ * bfq_group handlers ++ * The following functions help in navigating the bfq_group hierarchy ++ * by allowing to find the parent of a bfq_group or the bfq_group ++ * associated to a bfq_queue. ++ */ ++ ++static struct bfq_group *bfqg_parent(struct bfq_group *bfqg) ++{ ++ struct blkcg_gq *pblkg = bfqg_to_blkg(bfqg)->parent; ++ ++ return pblkg ? blkg_to_bfqg(pblkg) : NULL; ++} ++ ++static struct bfq_group *bfqq_group(struct bfq_queue *bfqq) ++{ ++ struct bfq_entity *group_entity = bfqq->entity.parent; ++ ++ return group_entity ? container_of(group_entity, struct bfq_group, ++ entity) : ++ bfqq->bfqd->root_group; ++} ++ ++/* ++ * The following two functions handle get and put of a bfq_group by ++ * wrapping the related blk-cgroup hooks. ++ */ ++ ++static void bfqg_get(struct bfq_group *bfqg) ++{ ++ return blkg_get(bfqg_to_blkg(bfqg)); ++} ++ ++static void bfqg_put(struct bfq_group *bfqg) ++{ ++ return blkg_put(bfqg_to_blkg(bfqg)); ++} ++ ++static void bfqg_stats_update_io_add(struct bfq_group *bfqg, ++ struct bfq_queue *bfqq, ++ int rw) ++{ ++ blkg_rwstat_add(&bfqg->stats.queued, rw, 1); ++ bfqg_stats_end_empty_time(&bfqg->stats); ++ if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue)) ++ bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq)); ++} ++ ++static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, int rw) ++{ ++ blkg_rwstat_add(&bfqg->stats.queued, rw, -1); ++} ++ ++static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, int rw) ++{ ++ blkg_rwstat_add(&bfqg->stats.merged, rw, 1); ++} ++ ++static void bfqg_stats_update_dispatch(struct bfq_group *bfqg, ++ uint64_t bytes, int rw) ++{ ++ blkg_stat_add(&bfqg->stats.sectors, bytes >> 9); ++ blkg_rwstat_add(&bfqg->stats.serviced, rw, 1); ++ blkg_rwstat_add(&bfqg->stats.service_bytes, rw, bytes); ++} ++ ++static void bfqg_stats_update_completion(struct bfq_group *bfqg, ++ uint64_t start_time, uint64_t io_start_time, int rw) ++{ ++ struct bfqg_stats *stats = &bfqg->stats; ++ unsigned long long now = sched_clock(); ++ ++ if (time_after64(now, io_start_time)) ++ blkg_rwstat_add(&stats->service_time, rw, now - io_start_time); ++ if (time_after64(io_start_time, start_time)) ++ blkg_rwstat_add(&stats->wait_time, rw, ++ io_start_time - start_time); ++} ++ ++/* @stats = 0 */ ++static void bfqg_stats_reset(struct bfqg_stats *stats) ++{ ++ if (!stats) ++ return; ++ ++ /* queued stats shouldn't be cleared */ ++ blkg_rwstat_reset(&stats->service_bytes); ++ blkg_rwstat_reset(&stats->serviced); ++ blkg_rwstat_reset(&stats->merged); ++ blkg_rwstat_reset(&stats->service_time); ++ blkg_rwstat_reset(&stats->wait_time); ++ blkg_stat_reset(&stats->time); ++ blkg_stat_reset(&stats->unaccounted_time); ++ blkg_stat_reset(&stats->avg_queue_size_sum); ++ blkg_stat_reset(&stats->avg_queue_size_samples); ++ blkg_stat_reset(&stats->dequeue); ++ blkg_stat_reset(&stats->group_wait_time); ++ blkg_stat_reset(&stats->idle_time); ++ blkg_stat_reset(&stats->empty_time); ++} ++ ++/* @to += @from */ ++static void bfqg_stats_merge(struct bfqg_stats *to, struct bfqg_stats *from) ++{ ++ if (!to || !from) ++ return; ++ ++ /* queued stats shouldn't be cleared */ ++ blkg_rwstat_add_aux(&to->service_bytes, &from->service_bytes); ++ blkg_rwstat_add_aux(&to->serviced, &from->serviced); ++ blkg_rwstat_add_aux(&to->merged, &from->merged); ++ blkg_rwstat_add_aux(&to->service_time, &from->service_time); ++ blkg_rwstat_add_aux(&to->wait_time, &from->wait_time); ++ blkg_stat_add_aux(&from->time, &from->time); ++ blkg_stat_add_aux(&to->unaccounted_time, &from->unaccounted_time); ++ blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum); ++ blkg_stat_add_aux(&to->avg_queue_size_samples, &from->avg_queue_size_samples); ++ blkg_stat_add_aux(&to->dequeue, &from->dequeue); ++ blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time); ++ blkg_stat_add_aux(&to->idle_time, &from->idle_time); ++ blkg_stat_add_aux(&to->empty_time, &from->empty_time); ++} ++ ++/* ++ * Transfer @bfqg's stats to its parent's dead_stats so that the ancestors' ++ * recursive stats can still account for the amount used by this bfqg after ++ * it's gone. ++ */ ++static void bfqg_stats_xfer_dead(struct bfq_group *bfqg) ++{ ++ struct bfq_group *parent; ++ ++ if (!bfqg) /* root_group */ ++ return; ++ ++ parent = bfqg_parent(bfqg); ++ ++ lockdep_assert_held(bfqg_to_blkg(bfqg)->q->queue_lock); ++ ++ if (unlikely(!parent)) ++ return; ++ ++ bfqg_stats_merge(&parent->dead_stats, &bfqg->stats); ++ bfqg_stats_merge(&parent->dead_stats, &bfqg->dead_stats); ++ bfqg_stats_reset(&bfqg->stats); ++ bfqg_stats_reset(&bfqg->dead_stats); ++} ++ ++static void bfq_init_entity(struct bfq_entity *entity, ++ struct bfq_group *bfqg) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ ++ entity->weight = entity->new_weight; ++ entity->orig_weight = entity->new_weight; ++ if (bfqq) { ++ bfqq->ioprio = bfqq->new_ioprio; ++ bfqq->ioprio_class = bfqq->new_ioprio_class; ++ bfqg_get(bfqg); ++ } ++ entity->parent = bfqg->my_entity; ++ entity->sched_data = &bfqg->sched_data; ++} ++ ++static void bfqg_stats_exit(struct bfqg_stats *stats) ++{ ++ blkg_rwstat_exit(&stats->service_bytes); ++ blkg_rwstat_exit(&stats->serviced); ++ blkg_rwstat_exit(&stats->merged); ++ blkg_rwstat_exit(&stats->service_time); ++ blkg_rwstat_exit(&stats->wait_time); ++ blkg_rwstat_exit(&stats->queued); ++ blkg_stat_exit(&stats->sectors); ++ blkg_stat_exit(&stats->time); ++ blkg_stat_exit(&stats->unaccounted_time); ++ blkg_stat_exit(&stats->avg_queue_size_sum); ++ blkg_stat_exit(&stats->avg_queue_size_samples); ++ blkg_stat_exit(&stats->dequeue); ++ blkg_stat_exit(&stats->group_wait_time); ++ blkg_stat_exit(&stats->idle_time); ++ blkg_stat_exit(&stats->empty_time); ++} ++ ++static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) ++{ ++ if (blkg_rwstat_init(&stats->service_bytes, gfp) || ++ blkg_rwstat_init(&stats->serviced, gfp) || ++ blkg_rwstat_init(&stats->merged, gfp) || ++ blkg_rwstat_init(&stats->service_time, gfp) || ++ blkg_rwstat_init(&stats->wait_time, gfp) || ++ blkg_rwstat_init(&stats->queued, gfp) || ++ blkg_stat_init(&stats->sectors, gfp) || ++ blkg_stat_init(&stats->time, gfp) || ++ blkg_stat_init(&stats->unaccounted_time, gfp) || ++ blkg_stat_init(&stats->avg_queue_size_sum, gfp) || ++ blkg_stat_init(&stats->avg_queue_size_samples, gfp) || ++ blkg_stat_init(&stats->dequeue, gfp) || ++ blkg_stat_init(&stats->group_wait_time, gfp) || ++ blkg_stat_init(&stats->idle_time, gfp) || ++ blkg_stat_init(&stats->empty_time, gfp)) { ++ bfqg_stats_exit(stats); ++ return -ENOMEM; ++ } ++ ++ return 0; ++} ++ ++static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd) ++ { ++ return cpd ? container_of(cpd, struct bfq_group_data, pd) : NULL; ++ } ++ ++static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg) ++{ ++ return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq)); ++} ++ ++static void bfq_cpd_init(struct blkcg_policy_data *cpd) ++{ ++ struct bfq_group_data *d = cpd_to_bfqgd(cpd); ++ ++ d->weight = BFQ_DEFAULT_GRP_WEIGHT; ++} ++ ++static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node) ++{ ++ struct bfq_group *bfqg; ++ ++ bfqg = kzalloc_node(sizeof(*bfqg), gfp, node); ++ if (!bfqg) ++ return NULL; ++ ++ if (bfqg_stats_init(&bfqg->stats, gfp) || ++ bfqg_stats_init(&bfqg->dead_stats, gfp)) { ++ kfree(bfqg); ++ return NULL; ++ } ++ ++ return &bfqg->pd; ++} ++ ++static void bfq_group_set_parent(struct bfq_group *bfqg, ++ struct bfq_group *parent) ++{ ++ struct bfq_entity *entity; ++ ++ BUG_ON(!parent); ++ BUG_ON(!bfqg); ++ BUG_ON(bfqg == parent); ++ ++ entity = &bfqg->entity; ++ entity->parent = parent->my_entity; ++ entity->sched_data = &parent->sched_data; ++} ++ ++static void bfq_pd_init(struct blkg_policy_data *pd) ++{ ++ struct blkcg_gq *blkg = pd_to_blkg(pd); ++ struct bfq_group *bfqg = blkg_to_bfqg(blkg); ++ struct bfq_data *bfqd = blkg->q->elevator->elevator_data; ++ struct bfq_entity *entity = &bfqg->entity; ++ struct bfq_group_data *d = blkcg_to_bfqgd(blkg->blkcg); ++ ++ entity->orig_weight = entity->weight = entity->new_weight = d->weight; ++ entity->my_sched_data = &bfqg->sched_data; ++ bfqg->my_entity = entity; /* ++ * the root_group's will be set to NULL ++ * in bfq_init_queue() ++ */ ++ bfqg->bfqd = bfqd; ++ bfqg->active_entities = 0; ++} ++ ++static void bfq_pd_free(struct blkg_policy_data *pd) ++{ ++ struct bfq_group *bfqg = pd_to_bfqg(pd); ++ ++ bfqg_stats_exit(&bfqg->stats); ++ bfqg_stats_exit(&bfqg->dead_stats); ++ ++ return kfree(bfqg); ++} ++ ++/* offset delta from bfqg->stats to bfqg->dead_stats */ ++static const int dead_stats_off_delta = offsetof(struct bfq_group, dead_stats) - ++ offsetof(struct bfq_group, stats); ++ ++/* to be used by recursive prfill, sums live and dead stats recursively */ ++static u64 bfqg_stat_pd_recursive_sum(struct blkg_policy_data *pd, int off) ++{ ++ u64 sum = 0; ++ ++ sum += blkg_stat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off); ++ sum += blkg_stat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, ++ off + dead_stats_off_delta); ++ return sum; ++} ++ ++/* to be used by recursive prfill, sums live and dead rwstats recursively */ ++static struct blkg_rwstat bfqg_rwstat_pd_recursive_sum(struct blkg_policy_data *pd, ++ int off) ++{ ++ struct blkg_rwstat a, b; ++ ++ a = blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off); ++ b = blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, ++ off + dead_stats_off_delta); ++ blkg_rwstat_add_aux(&a, &b); ++ return a; ++} ++ ++static void bfq_pd_reset_stats(struct blkg_policy_data *pd) ++{ ++ struct bfq_group *bfqg = pd_to_bfqg(pd); ++ ++ bfqg_stats_reset(&bfqg->stats); ++ bfqg_stats_reset(&bfqg->dead_stats); ++} ++ ++static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd, ++ struct blkcg *blkcg) ++{ ++ struct request_queue *q = bfqd->queue; ++ struct bfq_group *bfqg = NULL, *parent; ++ struct bfq_entity *entity = NULL; ++ ++ assert_spin_locked(bfqd->queue->queue_lock); ++ ++ /* avoid lookup for the common case where there's no blkcg */ ++ if (blkcg == &blkcg_root) { ++ bfqg = bfqd->root_group; ++ } else { ++ struct blkcg_gq *blkg; ++ ++ blkg = blkg_lookup_create(blkcg, q); ++ if (!IS_ERR(blkg)) ++ bfqg = blkg_to_bfqg(blkg); ++ else /* fallback to root_group */ ++ bfqg = bfqd->root_group; ++ } ++ ++ BUG_ON(!bfqg); ++ ++ /* ++ * Update chain of bfq_groups as we might be handling a leaf group ++ * which, along with some of its relatives, has not been hooked yet ++ * to the private hierarchy of BFQ. ++ */ ++ entity = &bfqg->entity; ++ for_each_entity(entity) { ++ bfqg = container_of(entity, struct bfq_group, entity); ++ BUG_ON(!bfqg); ++ if (bfqg != bfqd->root_group) { ++ parent = bfqg_parent(bfqg); ++ if (!parent) ++ parent = bfqd->root_group; ++ BUG_ON(!parent); ++ bfq_group_set_parent(bfqg, parent); ++ } ++ } ++ ++ return bfqg; ++} ++ ++/** ++ * bfq_bfqq_move - migrate @bfqq to @bfqg. ++ * @bfqd: queue descriptor. ++ * @bfqq: the queue to move. ++ * @entity: @bfqq's entity. ++ * @bfqg: the group to move to. ++ * ++ * Move @bfqq to @bfqg, deactivating it from its old group and reactivating ++ * it on the new one. Avoid putting the entity on the old group idle tree. ++ * ++ * Must be called under the queue lock; the cgroup owning @bfqg must ++ * not disappear (by now this just means that we are called under ++ * rcu_read_lock()). ++ */ ++static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ struct bfq_entity *entity, struct bfq_group *bfqg) ++{ ++ int busy, resume; ++ ++ busy = bfq_bfqq_busy(bfqq); ++ resume = !RB_EMPTY_ROOT(&bfqq->sort_list); ++ ++ BUG_ON(resume && !entity->on_st); ++ BUG_ON(busy && !resume && entity->on_st && ++ bfqq != bfqd->in_service_queue); ++ ++ if (busy) { ++ BUG_ON(atomic_read(&bfqq->ref) < 2); ++ ++ if (!resume) ++ bfq_del_bfqq_busy(bfqd, bfqq, 0); ++ else ++ bfq_deactivate_bfqq(bfqd, bfqq, 0); ++ } else if (entity->on_st) ++ bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); ++ bfqg_put(bfqq_group(bfqq)); ++ ++ /* ++ * Here we use a reference to bfqg. We don't need a refcounter ++ * as the cgroup reference will not be dropped, so that its ++ * destroy() callback will not be invoked. ++ */ ++ entity->parent = bfqg->my_entity; ++ entity->sched_data = &bfqg->sched_data; ++ bfqg_get(bfqg); ++ ++ if (busy) { ++ if (resume) ++ bfq_activate_bfqq(bfqd, bfqq); ++ } ++ ++ if (!bfqd->in_service_queue && !bfqd->rq_in_driver) ++ bfq_schedule_dispatch(bfqd); ++} ++ ++/** ++ * __bfq_bic_change_cgroup - move @bic to @cgroup. ++ * @bfqd: the queue descriptor. ++ * @bic: the bic to move. ++ * @blkcg: the blk-cgroup to move to. ++ * ++ * Move bic to blkcg, assuming that bfqd->queue is locked; the caller ++ * has to make sure that the reference to cgroup is valid across the call. ++ * ++ * NOTE: an alternative approach might have been to store the current ++ * cgroup in bfqq and getting a reference to it, reducing the lookup ++ * time here, at the price of slightly more complex code. ++ */ ++static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, ++ struct bfq_io_cq *bic, ++ struct blkcg *blkcg) ++{ ++ struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0); ++ struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1); ++ struct bfq_group *bfqg; ++ struct bfq_entity *entity; ++ ++ lockdep_assert_held(bfqd->queue->queue_lock); ++ ++ bfqg = bfq_find_alloc_group(bfqd, blkcg); ++ if (async_bfqq) { ++ entity = &async_bfqq->entity; ++ ++ if (entity->sched_data != &bfqg->sched_data) { ++ bic_set_bfqq(bic, NULL, 0); ++ bfq_log_bfqq(bfqd, async_bfqq, ++ "bic_change_group: %p %d", ++ async_bfqq, atomic_read(&async_bfqq->ref)); ++ bfq_put_queue(async_bfqq); ++ } ++ } ++ ++ if (sync_bfqq) { ++ entity = &sync_bfqq->entity; ++ if (entity->sched_data != &bfqg->sched_data) ++ bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg); ++ } ++ ++ return bfqg; ++} ++ ++static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) ++{ ++ struct bfq_data *bfqd = bic_to_bfqd(bic); ++ struct blkcg *blkcg; ++ struct bfq_group *bfqg = NULL; ++ uint64_t id; ++ ++ rcu_read_lock(); ++ blkcg = bio_blkcg(bio); ++ id = blkcg->css.serial_nr; ++ rcu_read_unlock(); ++ ++ /* ++ * Check whether blkcg has changed. The condition may trigger ++ * spuriously on a newly created cic but there's no harm. ++ */ ++ if (unlikely(!bfqd) || likely(bic->blkcg_id == id)) ++ return; ++ ++ bfqg = __bfq_bic_change_cgroup(bfqd, bic, blkcg); ++ BUG_ON(!bfqg); ++ bic->blkcg_id = id; ++} ++ ++/** ++ * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st. ++ * @st: the service tree being flushed. ++ */ ++static void bfq_flush_idle_tree(struct bfq_service_tree *st) ++{ ++ struct bfq_entity *entity = st->first_idle; ++ ++ for (; entity ; entity = st->first_idle) ++ __bfq_deactivate_entity(entity, 0); ++} ++ ++/** ++ * bfq_reparent_leaf_entity - move leaf entity to the root_group. ++ * @bfqd: the device data structure with the root group. ++ * @entity: the entity to move. ++ */ ++static void bfq_reparent_leaf_entity(struct bfq_data *bfqd, ++ struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ ++ BUG_ON(!bfqq); ++ bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group); ++ return; ++} ++ ++/** ++ * bfq_reparent_active_entities - move to the root group all active ++ * entities. ++ * @bfqd: the device data structure with the root group. ++ * @bfqg: the group to move from. ++ * @st: the service tree with the entities. ++ * ++ * Needs queue_lock to be taken and reference to be valid over the call. ++ */ ++static void bfq_reparent_active_entities(struct bfq_data *bfqd, ++ struct bfq_group *bfqg, ++ struct bfq_service_tree *st) ++{ ++ struct rb_root *active = &st->active; ++ struct bfq_entity *entity = NULL; ++ ++ if (!RB_EMPTY_ROOT(&st->active)) ++ entity = bfq_entity_of(rb_first(active)); ++ ++ for (; entity ; entity = bfq_entity_of(rb_first(active))) ++ bfq_reparent_leaf_entity(bfqd, entity); ++ ++ if (bfqg->sched_data.in_service_entity) ++ bfq_reparent_leaf_entity(bfqd, ++ bfqg->sched_data.in_service_entity); ++ ++ return; ++} ++ ++/** ++ * bfq_destroy_group - destroy @bfqg. ++ * @bfqg: the group being destroyed. ++ * ++ * Destroy @bfqg, making sure that it is not referenced from its parent. ++ * blkio already grabs the queue_lock for us, so no need to use RCU-based magic ++ */ ++static void bfq_pd_offline(struct blkg_policy_data *pd) ++{ ++ struct bfq_service_tree *st; ++ struct bfq_group *bfqg; ++ struct bfq_data *bfqd; ++ struct bfq_entity *entity; ++ int i; ++ ++ BUG_ON(!pd); ++ bfqg = pd_to_bfqg(pd); ++ BUG_ON(!bfqg); ++ bfqd = bfqg->bfqd; ++ BUG_ON(bfqd && !bfqd->root_group); ++ ++ entity = bfqg->my_entity; ++ ++ if (!entity) /* root group */ ++ return; ++ ++ /* ++ * Empty all service_trees belonging to this group before ++ * deactivating the group itself. ++ */ ++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) { ++ BUG_ON(!bfqg->sched_data.service_tree); ++ st = bfqg->sched_data.service_tree + i; ++ /* ++ * The idle tree may still contain bfq_queues belonging ++ * to exited task because they never migrated to a different ++ * cgroup from the one being destroyed now. No one else ++ * can access them so it's safe to act without any lock. ++ */ ++ bfq_flush_idle_tree(st); ++ ++ /* ++ * It may happen that some queues are still active ++ * (busy) upon group destruction (if the corresponding ++ * processes have been forced to terminate). We move ++ * all the leaf entities corresponding to these queues ++ * to the root_group. ++ * Also, it may happen that the group has an entity ++ * in service, which is disconnected from the active ++ * tree: it must be moved, too. ++ * There is no need to put the sync queues, as the ++ * scheduler has taken no reference. ++ */ ++ bfq_reparent_active_entities(bfqd, bfqg, st); ++ BUG_ON(!RB_EMPTY_ROOT(&st->active)); ++ BUG_ON(!RB_EMPTY_ROOT(&st->idle)); ++ } ++ BUG_ON(bfqg->sched_data.next_in_service); ++ BUG_ON(bfqg->sched_data.in_service_entity); ++ ++ __bfq_deactivate_entity(entity, 0); ++ bfq_put_async_queues(bfqd, bfqg); ++ BUG_ON(entity->tree); ++ ++ bfqg_stats_xfer_dead(bfqg); ++} ++ ++static void bfq_end_wr_async(struct bfq_data *bfqd) ++{ ++ struct blkcg_gq *blkg; ++ ++ list_for_each_entry(blkg, &bfqd->queue->blkg_list, q_node) { ++ struct bfq_group *bfqg = blkg_to_bfqg(blkg); ++ ++ bfq_end_wr_async_queues(bfqd, bfqg); ++ } ++ bfq_end_wr_async_queues(bfqd, bfqd->root_group); ++} ++ ++static u64 bfqio_cgroup_weight_read(struct cgroup_subsys_state *css, ++ struct cftype *cftype) ++{ ++ struct blkcg *blkcg = css_to_blkcg(css); ++ struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); ++ int ret = -EINVAL; ++ ++ spin_lock_irq(&blkcg->lock); ++ ret = bfqgd->weight; ++ spin_unlock_irq(&blkcg->lock); ++ ++ return ret; ++} ++ ++static int bfqio_cgroup_weight_read_dfl(struct seq_file *sf, void *v) ++{ ++ struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); ++ struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); ++ ++ spin_lock_irq(&blkcg->lock); ++ seq_printf(sf, "%u\n", bfqgd->weight); ++ spin_unlock_irq(&blkcg->lock); ++ ++ return 0; ++} ++ ++static int bfqio_cgroup_weight_write(struct cgroup_subsys_state *css, ++ struct cftype *cftype, ++ u64 val) ++{ ++ struct blkcg *blkcg = css_to_blkcg(css); ++ struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); ++ struct blkcg_gq *blkg; ++ int ret = -EINVAL; ++ ++ if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT) ++ return ret; ++ ++ ret = 0; ++ spin_lock_irq(&blkcg->lock); ++ bfqgd->weight = (unsigned short)val; ++ hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { ++ struct bfq_group *bfqg = blkg_to_bfqg(blkg); ++ if (!bfqg) ++ continue; ++ /* ++ * Setting the prio_changed flag of the entity ++ * to 1 with new_weight == weight would re-set ++ * the value of the weight to its ioprio mapping. ++ * Set the flag only if necessary. ++ */ ++ if ((unsigned short)val != bfqg->entity.new_weight) { ++ bfqg->entity.new_weight = (unsigned short)val; ++ /* ++ * Make sure that the above new value has been ++ * stored in bfqg->entity.new_weight before ++ * setting the prio_changed flag. In fact, ++ * this flag may be read asynchronously (in ++ * critical sections protected by a different ++ * lock than that held here), and finding this ++ * flag set may cause the execution of the code ++ * for updating parameters whose value may ++ * depend also on bfqg->entity.new_weight (in ++ * __bfq_entity_update_weight_prio). ++ * This barrier makes sure that the new value ++ * of bfqg->entity.new_weight is correctly ++ * seen in that code. ++ */ ++ smp_wmb(); ++ bfqg->entity.prio_changed = 1; ++ } ++ } ++ spin_unlock_irq(&blkcg->lock); ++ ++ return ret; ++} ++ ++static ssize_t bfqio_cgroup_weight_write_dfl(struct kernfs_open_file *of, ++ char *buf, size_t nbytes, ++ loff_t off) ++{ ++ /* First unsigned long found in the file is used */ ++ return bfqio_cgroup_weight_write(of_css(of), NULL, ++ simple_strtoull(strim(buf), NULL, 0)); ++} ++ ++static int bfqg_print_stat(struct seq_file *sf, void *v) ++{ ++ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat, ++ &blkcg_policy_bfq, seq_cft(sf)->private, false); ++ return 0; ++} ++ ++static int bfqg_print_rwstat(struct seq_file *sf, void *v) ++{ ++ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat, ++ &blkcg_policy_bfq, seq_cft(sf)->private, true); ++ return 0; ++} ++ ++static u64 bfqg_prfill_stat_recursive(struct seq_file *sf, ++ struct blkg_policy_data *pd, int off) ++{ ++ u64 sum = bfqg_stat_pd_recursive_sum(pd, off); ++ ++ return __blkg_prfill_u64(sf, pd, sum); ++} ++ ++static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf, ++ struct blkg_policy_data *pd, int off) ++{ ++ struct blkg_rwstat sum = bfqg_rwstat_pd_recursive_sum(pd, off); ++ ++ return __blkg_prfill_rwstat(sf, pd, &sum); ++} ++ ++static int bfqg_print_stat_recursive(struct seq_file *sf, void *v) ++{ ++ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), ++ bfqg_prfill_stat_recursive, &blkcg_policy_bfq, ++ seq_cft(sf)->private, false); ++ return 0; ++} ++ ++static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v) ++{ ++ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), ++ bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq, ++ seq_cft(sf)->private, true); ++ return 0; ++} ++ ++static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf, ++ struct blkg_policy_data *pd, int off) ++{ ++ struct bfq_group *bfqg = pd_to_bfqg(pd); ++ u64 samples = blkg_stat_read(&bfqg->stats.avg_queue_size_samples); ++ u64 v = 0; ++ ++ if (samples) { ++ v = blkg_stat_read(&bfqg->stats.avg_queue_size_sum); ++ v = div64_u64(v, samples); ++ } ++ __blkg_prfill_u64(sf, pd, v); ++ return 0; ++} ++ ++/* print avg_queue_size */ ++static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v) ++{ ++ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), ++ bfqg_prfill_avg_queue_size, &blkcg_policy_bfq, ++ 0, false); ++ return 0; ++} ++ ++static struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) ++{ ++ int ret; ++ ++ ret = blkcg_activate_policy(bfqd->queue, &blkcg_policy_bfq); ++ if (ret) ++ return NULL; ++ ++ return blkg_to_bfqg(bfqd->queue->root_blkg); ++} ++ ++static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp) ++{ ++ struct bfq_group_data *bgd; ++ ++ bgd = kzalloc(sizeof(*bgd), GFP_KERNEL); ++ if (!bgd) ++ return NULL; ++ return &bgd->pd; ++} ++ ++static void bfq_cpd_free(struct blkcg_policy_data *cpd) ++{ ++ kfree(cpd_to_bfqgd(cpd)); ++} ++ ++static struct cftype bfqio_files_dfl[] = { ++ { ++ .name = "weight", ++ .flags = CFTYPE_NOT_ON_ROOT, ++ .seq_show = bfqio_cgroup_weight_read_dfl, ++ .write = bfqio_cgroup_weight_write_dfl, ++ }, ++ {} /* terminate */ ++}; ++ ++static struct cftype bfqio_files[] = { ++ { ++ .name = "bfq.weight", ++ .read_u64 = bfqio_cgroup_weight_read, ++ .write_u64 = bfqio_cgroup_weight_write, ++ }, ++ /* statistics, cover only the tasks in the bfqg */ ++ { ++ .name = "bfq.time", ++ .private = offsetof(struct bfq_group, stats.time), ++ .seq_show = bfqg_print_stat, ++ }, ++ { ++ .name = "bfq.sectors", ++ .private = offsetof(struct bfq_group, stats.sectors), ++ .seq_show = bfqg_print_stat, ++ }, ++ { ++ .name = "bfq.io_service_bytes", ++ .private = offsetof(struct bfq_group, stats.service_bytes), ++ .seq_show = bfqg_print_rwstat, ++ }, ++ { ++ .name = "bfq.io_serviced", ++ .private = offsetof(struct bfq_group, stats.serviced), ++ .seq_show = bfqg_print_rwstat, ++ }, ++ { ++ .name = "bfq.io_service_time", ++ .private = offsetof(struct bfq_group, stats.service_time), ++ .seq_show = bfqg_print_rwstat, ++ }, ++ { ++ .name = "bfq.io_wait_time", ++ .private = offsetof(struct bfq_group, stats.wait_time), ++ .seq_show = bfqg_print_rwstat, ++ }, ++ { ++ .name = "bfq.io_merged", ++ .private = offsetof(struct bfq_group, stats.merged), ++ .seq_show = bfqg_print_rwstat, ++ }, ++ { ++ .name = "bfq.io_queued", ++ .private = offsetof(struct bfq_group, stats.queued), ++ .seq_show = bfqg_print_rwstat, ++ }, ++ ++ /* the same statictics which cover the bfqg and its descendants */ ++ { ++ .name = "bfq.time_recursive", ++ .private = offsetof(struct bfq_group, stats.time), ++ .seq_show = bfqg_print_stat_recursive, ++ }, ++ { ++ .name = "bfq.sectors_recursive", ++ .private = offsetof(struct bfq_group, stats.sectors), ++ .seq_show = bfqg_print_stat_recursive, ++ }, ++ { ++ .name = "bfq.io_service_bytes_recursive", ++ .private = offsetof(struct bfq_group, stats.service_bytes), ++ .seq_show = bfqg_print_rwstat_recursive, ++ }, ++ { ++ .name = "bfq.io_serviced_recursive", ++ .private = offsetof(struct bfq_group, stats.serviced), ++ .seq_show = bfqg_print_rwstat_recursive, ++ }, ++ { ++ .name = "bfq.io_service_time_recursive", ++ .private = offsetof(struct bfq_group, stats.service_time), ++ .seq_show = bfqg_print_rwstat_recursive, ++ }, ++ { ++ .name = "bfq.io_wait_time_recursive", ++ .private = offsetof(struct bfq_group, stats.wait_time), ++ .seq_show = bfqg_print_rwstat_recursive, ++ }, ++ { ++ .name = "bfq.io_merged_recursive", ++ .private = offsetof(struct bfq_group, stats.merged), ++ .seq_show = bfqg_print_rwstat_recursive, ++ }, ++ { ++ .name = "bfq.io_queued_recursive", ++ .private = offsetof(struct bfq_group, stats.queued), ++ .seq_show = bfqg_print_rwstat_recursive, ++ }, ++ { ++ .name = "bfq.avg_queue_size", ++ .seq_show = bfqg_print_avg_queue_size, ++ }, ++ { ++ .name = "bfq.group_wait_time", ++ .private = offsetof(struct bfq_group, stats.group_wait_time), ++ .seq_show = bfqg_print_stat, ++ }, ++ { ++ .name = "bfq.idle_time", ++ .private = offsetof(struct bfq_group, stats.idle_time), ++ .seq_show = bfqg_print_stat, ++ }, ++ { ++ .name = "bfq.empty_time", ++ .private = offsetof(struct bfq_group, stats.empty_time), ++ .seq_show = bfqg_print_stat, ++ }, ++ { ++ .name = "bfq.dequeue", ++ .private = offsetof(struct bfq_group, stats.dequeue), ++ .seq_show = bfqg_print_stat, ++ }, ++ { ++ .name = "bfq.unaccounted_time", ++ .private = offsetof(struct bfq_group, stats.unaccounted_time), ++ .seq_show = bfqg_print_stat, ++ }, ++ { } /* terminate */ ++}; ++ ++static struct blkcg_policy blkcg_policy_bfq = { ++ .dfl_cftypes = bfqio_files_dfl, ++ .legacy_cftypes = bfqio_files, ++ ++ .pd_alloc_fn = bfq_pd_alloc, ++ .pd_init_fn = bfq_pd_init, ++ .pd_offline_fn = bfq_pd_offline, ++ .pd_free_fn = bfq_pd_free, ++ .pd_reset_stats_fn = bfq_pd_reset_stats, ++ ++ .cpd_alloc_fn = bfq_cpd_alloc, ++ .cpd_init_fn = bfq_cpd_init, ++ .cpd_bind_fn = bfq_cpd_init, ++ .cpd_free_fn = bfq_cpd_free, ++ ++}; ++ ++#else ++ ++static void bfq_init_entity(struct bfq_entity *entity, ++ struct bfq_group *bfqg) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ entity->weight = entity->new_weight; ++ entity->orig_weight = entity->new_weight; ++ if (bfqq) { ++ bfqq->ioprio = bfqq->new_ioprio; ++ bfqq->ioprio_class = bfqq->new_ioprio_class; ++ } ++ entity->sched_data = &bfqg->sched_data; ++} ++ ++static struct bfq_group * ++bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) ++{ ++ struct bfq_data *bfqd = bic_to_bfqd(bic); ++ return bfqd->root_group; ++} ++ ++static void bfq_bfqq_move(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ struct bfq_entity *entity, ++ struct bfq_group *bfqg) ++{ ++} ++ ++static void bfq_end_wr_async(struct bfq_data *bfqd) ++{ ++ bfq_end_wr_async_queues(bfqd, bfqd->root_group); ++} ++ ++static void bfq_disconnect_groups(struct bfq_data *bfqd) ++{ ++ bfq_put_async_queues(bfqd, bfqd->root_group); ++} ++ ++static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd, ++ struct blkcg *blkcg) ++{ ++ return bfqd->root_group; ++} ++ ++static struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) ++{ ++ struct bfq_group *bfqg; ++ int i; ++ ++ bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node); ++ if (!bfqg) ++ return NULL; ++ ++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) ++ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; ++ ++ return bfqg; ++} ++#endif +diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c +new file mode 100644 +index 0000000..fb7bb8f +--- /dev/null ++++ b/block/bfq-ioc.c +@@ -0,0 +1,36 @@ ++/* ++ * BFQ: I/O context handling. ++ * ++ * Based on ideas and code from CFQ: ++ * Copyright (C) 2003 Jens Axboe ++ * ++ * Copyright (C) 2008 Fabio Checconi ++ * Paolo Valente ++ * ++ * Copyright (C) 2010 Paolo Valente ++ */ ++ ++/** ++ * icq_to_bic - convert iocontext queue structure to bfq_io_cq. ++ * @icq: the iocontext queue. ++ */ ++static struct bfq_io_cq *icq_to_bic(struct io_cq *icq) ++{ ++ /* bic->icq is the first member, %NULL will convert to %NULL */ ++ return container_of(icq, struct bfq_io_cq, icq); ++} ++ ++/** ++ * bfq_bic_lookup - search into @ioc a bic associated to @bfqd. ++ * @bfqd: the lookup key. ++ * @ioc: the io_context of the process doing I/O. ++ * ++ * Queue lock must be held. ++ */ ++static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, ++ struct io_context *ioc) ++{ ++ if (ioc) ++ return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue)); ++ return NULL; ++} +diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c +new file mode 100644 +index 0000000..f9787a6 +--- /dev/null ++++ b/block/bfq-iosched.c +@@ -0,0 +1,3754 @@ ++/* ++ * Budget Fair Queueing (BFQ) disk scheduler. ++ * ++ * Based on ideas and code from CFQ: ++ * Copyright (C) 2003 Jens Axboe ++ * ++ * Copyright (C) 2008 Fabio Checconi ++ * Paolo Valente ++ * ++ * Copyright (C) 2010 Paolo Valente ++ * ++ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ ++ * file. ++ * ++ * BFQ is a proportional-share storage-I/O scheduling algorithm based on ++ * the slice-by-slice service scheme of CFQ. But BFQ assigns budgets, ++ * measured in number of sectors, to processes instead of time slices. The ++ * device is not granted to the in-service process for a given time slice, ++ * but until it has exhausted its assigned budget. This change from the time ++ * to the service domain allows BFQ to distribute the device throughput ++ * among processes as desired, without any distortion due to ZBR, workload ++ * fluctuations or other factors. BFQ uses an ad hoc internal scheduler, ++ * called B-WF2Q+, to schedule processes according to their budgets. More ++ * precisely, BFQ schedules queues associated to processes. Thanks to the ++ * accurate policy of B-WF2Q+, BFQ can afford to assign high budgets to ++ * I/O-bound processes issuing sequential requests (to boost the ++ * throughput), and yet guarantee a low latency to interactive and soft ++ * real-time applications. ++ * ++ * BFQ is described in [1], where also a reference to the initial, more ++ * theoretical paper on BFQ can be found. The interested reader can find ++ * in the latter paper full details on the main algorithm, as well as ++ * formulas of the guarantees and formal proofs of all the properties. ++ * With respect to the version of BFQ presented in these papers, this ++ * implementation adds a few more heuristics, such as the one that ++ * guarantees a low latency to soft real-time applications, and a ++ * hierarchical extension based on H-WF2Q+. ++ * ++ * B-WF2Q+ is based on WF2Q+, that is described in [2], together with ++ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N) ++ * complexity derives from the one introduced with EEVDF in [3]. ++ * ++ * [1] P. Valente and M. Andreolini, ``Improving Application Responsiveness ++ * with the BFQ Disk I/O Scheduler'', ++ * Proceedings of the 5th Annual International Systems and Storage ++ * Conference (SYSTOR '12), June 2012. ++ * ++ * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf ++ * ++ * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing ++ * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689, ++ * Oct 1997. ++ * ++ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz ++ * ++ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline ++ * First: A Flexible and Accurate Mechanism for Proportional Share ++ * Resource Allocation,'' technical report. ++ * ++ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf ++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "bfq.h" ++#include "blk.h" ++ ++/* Expiration time of sync (0) and async (1) requests, in jiffies. */ ++static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 }; ++ ++/* Maximum backwards seek, in KiB. */ ++static const int bfq_back_max = 16 * 1024; ++ ++/* Penalty of a backwards seek, in number of sectors. */ ++static const int bfq_back_penalty = 2; ++ ++/* Idling period duration, in jiffies. */ ++static int bfq_slice_idle = HZ / 125; ++ ++/* Minimum number of assigned budgets for which stats are safe to compute. */ ++static const int bfq_stats_min_budgets = 194; ++ ++/* Default maximum budget values, in sectors and number of requests. */ ++static const int bfq_default_max_budget = 16 * 1024; ++static const int bfq_max_budget_async_rq = 4; ++ ++/* ++ * Async to sync throughput distribution is controlled as follows: ++ * when an async request is served, the entity is charged the number ++ * of sectors of the request, multiplied by the factor below ++ */ ++static const int bfq_async_charge_factor = 10; ++ ++/* Default timeout values, in jiffies, approximating CFQ defaults. */ ++static const int bfq_timeout_sync = HZ / 8; ++static int bfq_timeout_async = HZ / 25; ++ ++struct kmem_cache *bfq_pool; ++ ++/* Below this threshold (in ms), we consider thinktime immediate. */ ++#define BFQ_MIN_TT 2 ++ ++/* hw_tag detection: parallel requests threshold and min samples needed. */ ++#define BFQ_HW_QUEUE_THRESHOLD 4 ++#define BFQ_HW_QUEUE_SAMPLES 32 ++ ++#define BFQQ_SEEK_THR (sector_t)(8 * 1024) ++#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR) ++ ++/* Min samples used for peak rate estimation (for autotuning). */ ++#define BFQ_PEAK_RATE_SAMPLES 32 ++ ++/* Shift used for peak rate fixed precision calculations. */ ++#define BFQ_RATE_SHIFT 16 ++ ++/* ++ * By default, BFQ computes the duration of the weight raising for ++ * interactive applications automatically, using the following formula: ++ * duration = (R / r) * T, where r is the peak rate of the device, and ++ * R and T are two reference parameters. ++ * In particular, R is the peak rate of the reference device (see below), ++ * and T is a reference time: given the systems that are likely to be ++ * installed on the reference device according to its speed class, T is ++ * about the maximum time needed, under BFQ and while reading two files in ++ * parallel, to load typical large applications on these systems. ++ * In practice, the slower/faster the device at hand is, the more/less it ++ * takes to load applications with respect to the reference device. ++ * Accordingly, the longer/shorter BFQ grants weight raising to interactive ++ * applications. ++ * ++ * BFQ uses four different reference pairs (R, T), depending on: ++ * . whether the device is rotational or non-rotational; ++ * . whether the device is slow, such as old or portable HDDs, as well as ++ * SD cards, or fast, such as newer HDDs and SSDs. ++ * ++ * The device's speed class is dynamically (re)detected in ++ * bfq_update_peak_rate() every time the estimated peak rate is updated. ++ * ++ * In the following definitions, R_slow[0]/R_fast[0] and T_slow[0]/T_fast[0] ++ * are the reference values for a slow/fast rotational device, whereas ++ * R_slow[1]/R_fast[1] and T_slow[1]/T_fast[1] are the reference values for ++ * a slow/fast non-rotational device. Finally, device_speed_thresh are the ++ * thresholds used to switch between speed classes. ++ * Both the reference peak rates and the thresholds are measured in ++ * sectors/usec, left-shifted by BFQ_RATE_SHIFT. ++ */ ++static int R_slow[2] = {1536, 10752}; ++static int R_fast[2] = {17415, 34791}; ++/* ++ * To improve readability, a conversion function is used to initialize the ++ * following arrays, which entails that they can be initialized only in a ++ * function. ++ */ ++static int T_slow[2]; ++static int T_fast[2]; ++static int device_speed_thresh[2]; ++ ++#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \ ++ { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 }) ++ ++#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0]) ++#define RQ_BFQQ(rq) ((rq)->elv.priv[1]) ++ ++static void bfq_schedule_dispatch(struct bfq_data *bfqd); ++ ++#include "bfq-ioc.c" ++#include "bfq-sched.c" ++#include "bfq-cgroup.c" ++ ++#define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE) ++#define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT) ++ ++#define bfq_sample_valid(samples) ((samples) > 80) ++ ++/* ++ * We regard a request as SYNC, if either it's a read or has the SYNC bit ++ * set (in which case it could also be a direct WRITE). ++ */ ++static int bfq_bio_sync(struct bio *bio) ++{ ++ if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC)) ++ return 1; ++ ++ return 0; ++} ++ ++/* ++ * Scheduler run of queue, if there are requests pending and no one in the ++ * driver that will restart queueing. ++ */ ++static void bfq_schedule_dispatch(struct bfq_data *bfqd) ++{ ++ if (bfqd->queued != 0) { ++ bfq_log(bfqd, "schedule dispatch"); ++ kblockd_schedule_work(&bfqd->unplug_work); ++ } ++} ++ ++/* ++ * Lifted from AS - choose which of rq1 and rq2 that is best served now. ++ * We choose the request that is closesr to the head right now. Distance ++ * behind the head is penalized and only allowed to a certain extent. ++ */ ++static struct request *bfq_choose_req(struct bfq_data *bfqd, ++ struct request *rq1, ++ struct request *rq2, ++ sector_t last) ++{ ++ sector_t s1, s2, d1 = 0, d2 = 0; ++ unsigned long back_max; ++#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */ ++#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */ ++ unsigned wrap = 0; /* bit mask: requests behind the disk head? */ ++ ++ if (!rq1 || rq1 == rq2) ++ return rq2; ++ if (!rq2) ++ return rq1; ++ ++ if (rq_is_sync(rq1) && !rq_is_sync(rq2)) ++ return rq1; ++ else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) ++ return rq2; ++ if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META)) ++ return rq1; ++ else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META)) ++ return rq2; ++ ++ s1 = blk_rq_pos(rq1); ++ s2 = blk_rq_pos(rq2); ++ ++ /* ++ * By definition, 1KiB is 2 sectors. ++ */ ++ back_max = bfqd->bfq_back_max * 2; ++ ++ /* ++ * Strict one way elevator _except_ in the case where we allow ++ * short backward seeks which are biased as twice the cost of a ++ * similar forward seek. ++ */ ++ if (s1 >= last) ++ d1 = s1 - last; ++ else if (s1 + back_max >= last) ++ d1 = (last - s1) * bfqd->bfq_back_penalty; ++ else ++ wrap |= BFQ_RQ1_WRAP; ++ ++ if (s2 >= last) ++ d2 = s2 - last; ++ else if (s2 + back_max >= last) ++ d2 = (last - s2) * bfqd->bfq_back_penalty; ++ else ++ wrap |= BFQ_RQ2_WRAP; ++ ++ /* Found required data */ ++ ++ /* ++ * By doing switch() on the bit mask "wrap" we avoid having to ++ * check two variables for all permutations: --> faster! ++ */ ++ switch (wrap) { ++ case 0: /* common case for CFQ: rq1 and rq2 not wrapped */ ++ if (d1 < d2) ++ return rq1; ++ else if (d2 < d1) ++ return rq2; ++ else { ++ if (s1 >= s2) ++ return rq1; ++ else ++ return rq2; ++ } ++ ++ case BFQ_RQ2_WRAP: ++ return rq1; ++ case BFQ_RQ1_WRAP: ++ return rq2; ++ case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */ ++ default: ++ /* ++ * Since both rqs are wrapped, ++ * start with the one that's further behind head ++ * (--> only *one* back seek required), ++ * since back seek takes more time than forward. ++ */ ++ if (s1 <= s2) ++ return rq1; ++ else ++ return rq2; ++ } ++} ++ ++/* ++ * Tell whether there are active queues or groups with differentiated weights. ++ */ ++static bool bfq_differentiated_weights(struct bfq_data *bfqd) ++{ ++ /* ++ * For weights to differ, at least one of the trees must contain ++ * at least two nodes. ++ */ ++ return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) && ++ (bfqd->queue_weights_tree.rb_node->rb_left || ++ bfqd->queue_weights_tree.rb_node->rb_right) ++#ifdef CONFIG_BFQ_GROUP_IOSCHED ++ ) || ++ (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) && ++ (bfqd->group_weights_tree.rb_node->rb_left || ++ bfqd->group_weights_tree.rb_node->rb_right) ++#endif ++ ); ++} ++ ++/* ++ * The following function returns true if every queue must receive the ++ * same share of the throughput (this condition is used when deciding ++ * whether idling may be disabled, see the comments in the function ++ * bfq_bfqq_may_idle()). ++ * ++ * Such a scenario occurs when: ++ * 1) all active queues have the same weight, ++ * 2) all active groups at the same level in the groups tree have the same ++ * weight, ++ * 3) all active groups at the same level in the groups tree have the same ++ * number of children. ++ * ++ * Unfortunately, keeping the necessary state for evaluating exactly the ++ * above symmetry conditions would be quite complex and time-consuming. ++ * Therefore this function evaluates, instead, the following stronger ++ * sub-conditions, for which it is much easier to maintain the needed ++ * state: ++ * 1) all active queues have the same weight, ++ * 2) all active groups have the same weight, ++ * 3) all active groups have at most one active child each. ++ * In particular, the last two conditions are always true if hierarchical ++ * support and the cgroups interface are not enabled, thus no state needs ++ * to be maintained in this case. ++ */ ++static bool bfq_symmetric_scenario(struct bfq_data *bfqd) ++{ ++ return ++#ifdef CONFIG_BFQ_GROUP_IOSCHED ++ !bfqd->active_numerous_groups && ++#endif ++ !bfq_differentiated_weights(bfqd); ++} ++ ++/* ++ * If the weight-counter tree passed as input contains no counter for ++ * the weight of the input entity, then add that counter; otherwise just ++ * increment the existing counter. ++ * ++ * Note that weight-counter trees contain few nodes in mostly symmetric ++ * scenarios. For example, if all queues have the same weight, then the ++ * weight-counter tree for the queues may contain at most one node. ++ * This holds even if low_latency is on, because weight-raised queues ++ * are not inserted in the tree. ++ * In most scenarios, the rate at which nodes are created/destroyed ++ * should be low too. ++ */ ++static void bfq_weights_tree_add(struct bfq_data *bfqd, ++ struct bfq_entity *entity, ++ struct rb_root *root) ++{ ++ struct rb_node **new = &(root->rb_node), *parent = NULL; ++ ++ /* ++ * Do not insert if the entity is already associated with a ++ * counter, which happens if: ++ * 1) the entity is associated with a queue, ++ * 2) a request arrival has caused the queue to become both ++ * non-weight-raised, and hence change its weight, and ++ * backlogged; in this respect, each of the two events ++ * causes an invocation of this function, ++ * 3) this is the invocation of this function caused by the ++ * second event. This second invocation is actually useless, ++ * and we handle this fact by exiting immediately. More ++ * efficient or clearer solutions might possibly be adopted. ++ */ ++ if (entity->weight_counter) ++ return; ++ ++ while (*new) { ++ struct bfq_weight_counter *__counter = container_of(*new, ++ struct bfq_weight_counter, ++ weights_node); ++ parent = *new; ++ ++ if (entity->weight == __counter->weight) { ++ entity->weight_counter = __counter; ++ goto inc_counter; ++ } ++ if (entity->weight < __counter->weight) ++ new = &((*new)->rb_left); ++ else ++ new = &((*new)->rb_right); ++ } ++ ++ entity->weight_counter = kzalloc(sizeof(struct bfq_weight_counter), ++ GFP_ATOMIC); ++ entity->weight_counter->weight = entity->weight; ++ rb_link_node(&entity->weight_counter->weights_node, parent, new); ++ rb_insert_color(&entity->weight_counter->weights_node, root); ++ ++inc_counter: ++ entity->weight_counter->num_active++; ++} ++ ++/* ++ * Decrement the weight counter associated with the entity, and, if the ++ * counter reaches 0, remove the counter from the tree. ++ * See the comments to the function bfq_weights_tree_add() for considerations ++ * about overhead. ++ */ ++static void bfq_weights_tree_remove(struct bfq_data *bfqd, ++ struct bfq_entity *entity, ++ struct rb_root *root) ++{ ++ if (!entity->weight_counter) ++ return; ++ ++ BUG_ON(RB_EMPTY_ROOT(root)); ++ BUG_ON(entity->weight_counter->weight != entity->weight); ++ ++ BUG_ON(!entity->weight_counter->num_active); ++ entity->weight_counter->num_active--; ++ if (entity->weight_counter->num_active > 0) ++ goto reset_entity_pointer; ++ ++ rb_erase(&entity->weight_counter->weights_node, root); ++ kfree(entity->weight_counter); ++ ++reset_entity_pointer: ++ entity->weight_counter = NULL; ++} ++ ++static struct request *bfq_find_next_rq(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ struct request *last) ++{ ++ struct rb_node *rbnext = rb_next(&last->rb_node); ++ struct rb_node *rbprev = rb_prev(&last->rb_node); ++ struct request *next = NULL, *prev = NULL; ++ ++ BUG_ON(RB_EMPTY_NODE(&last->rb_node)); ++ ++ if (rbprev) ++ prev = rb_entry_rq(rbprev); ++ ++ if (rbnext) ++ next = rb_entry_rq(rbnext); ++ else { ++ rbnext = rb_first(&bfqq->sort_list); ++ if (rbnext && rbnext != &last->rb_node) ++ next = rb_entry_rq(rbnext); ++ } ++ ++ return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last)); ++} ++ ++/* see the definition of bfq_async_charge_factor for details */ ++static unsigned long bfq_serv_to_charge(struct request *rq, ++ struct bfq_queue *bfqq) ++{ ++ return blk_rq_sectors(rq) * ++ (1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->wr_coeff == 1) * ++ bfq_async_charge_factor)); ++} ++ ++/** ++ * bfq_updated_next_req - update the queue after a new next_rq selection. ++ * @bfqd: the device data the queue belongs to. ++ * @bfqq: the queue to update. ++ * ++ * If the first request of a queue changes we make sure that the queue ++ * has enough budget to serve at least its first request (if the ++ * request has grown). We do this because if the queue has not enough ++ * budget for its first request, it has to go through two dispatch ++ * rounds to actually get it dispatched. ++ */ ++static void bfq_updated_next_req(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity); ++ struct request *next_rq = bfqq->next_rq; ++ unsigned long new_budget; ++ ++ if (!next_rq) ++ return; ++ ++ if (bfqq == bfqd->in_service_queue) ++ /* ++ * In order not to break guarantees, budgets cannot be ++ * changed after an entity has been selected. ++ */ ++ return; ++ ++ BUG_ON(entity->tree != &st->active); ++ BUG_ON(entity == entity->sched_data->in_service_entity); ++ ++ new_budget = max_t(unsigned long, bfqq->max_budget, ++ bfq_serv_to_charge(next_rq, bfqq)); ++ if (entity->budget != new_budget) { ++ entity->budget = new_budget; ++ bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", ++ new_budget); ++ bfq_activate_bfqq(bfqd, bfqq); ++ } ++} ++ ++static unsigned int bfq_wr_duration(struct bfq_data *bfqd) ++{ ++ u64 dur; ++ ++ if (bfqd->bfq_wr_max_time > 0) ++ return bfqd->bfq_wr_max_time; ++ ++ dur = bfqd->RT_prod; ++ do_div(dur, bfqd->peak_rate); ++ ++ return dur; ++} ++ ++/* Empty burst list and add just bfqq (see comments to bfq_handle_burst) */ ++static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ struct bfq_queue *item; ++ struct hlist_node *n; ++ ++ hlist_for_each_entry_safe(item, n, &bfqd->burst_list, burst_list_node) ++ hlist_del_init(&item->burst_list_node); ++ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); ++ bfqd->burst_size = 1; ++} ++ ++/* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */ ++static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ /* Increment burst size to take into account also bfqq */ ++ bfqd->burst_size++; ++ ++ if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) { ++ struct bfq_queue *pos, *bfqq_item; ++ struct hlist_node *n; ++ ++ /* ++ * Enough queues have been activated shortly after each ++ * other to consider this burst as large. ++ */ ++ bfqd->large_burst = true; ++ ++ /* ++ * We can now mark all queues in the burst list as ++ * belonging to a large burst. ++ */ ++ hlist_for_each_entry(bfqq_item, &bfqd->burst_list, ++ burst_list_node) ++ bfq_mark_bfqq_in_large_burst(bfqq_item); ++ bfq_mark_bfqq_in_large_burst(bfqq); ++ ++ /* ++ * From now on, and until the current burst finishes, any ++ * new queue being activated shortly after the last queue ++ * was inserted in the burst can be immediately marked as ++ * belonging to a large burst. So the burst list is not ++ * needed any more. Remove it. ++ */ ++ hlist_for_each_entry_safe(pos, n, &bfqd->burst_list, ++ burst_list_node) ++ hlist_del_init(&pos->burst_list_node); ++ } else /* burst not yet large: add bfqq to the burst list */ ++ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); ++} ++ ++/* ++ * If many queues happen to become active shortly after each other, then, ++ * to help the processes associated to these queues get their job done as ++ * soon as possible, it is usually better to not grant either weight-raising ++ * or device idling to these queues. In this comment we describe, firstly, ++ * the reasons why this fact holds, and, secondly, the next function, which ++ * implements the main steps needed to properly mark these queues so that ++ * they can then be treated in a different way. ++ * ++ * As for the terminology, we say that a queue becomes active, i.e., ++ * switches from idle to backlogged, either when it is created (as a ++ * consequence of the arrival of an I/O request), or, if already existing, ++ * when a new request for the queue arrives while the queue is idle. ++ * Bursts of activations, i.e., activations of different queues occurring ++ * shortly after each other, are typically caused by services or applications ++ * that spawn or reactivate many parallel threads/processes. Examples are ++ * systemd during boot or git grep. ++ * ++ * These services or applications benefit mostly from a high throughput: ++ * the quicker the requests of the activated queues are cumulatively served, ++ * the sooner the target job of these queues gets completed. As a consequence, ++ * weight-raising any of these queues, which also implies idling the device ++ * for it, is almost always counterproductive: in most cases it just lowers ++ * throughput. ++ * ++ * On the other hand, a burst of activations may be also caused by the start ++ * of an application that does not consist in a lot of parallel I/O-bound ++ * threads. In fact, with a complex application, the burst may be just a ++ * consequence of the fact that several processes need to be executed to ++ * start-up the application. To start an application as quickly as possible, ++ * the best thing to do is to privilege the I/O related to the application ++ * with respect to all other I/O. Therefore, the best strategy to start as ++ * quickly as possible an application that causes a burst of activations is ++ * to weight-raise all the queues activated during the burst. This is the ++ * exact opposite of the best strategy for the other type of bursts. ++ * ++ * In the end, to take the best action for each of the two cases, the two ++ * types of bursts need to be distinguished. Fortunately, this seems ++ * relatively easy to do, by looking at the sizes of the bursts. In ++ * particular, we found a threshold such that bursts with a larger size ++ * than that threshold are apparently caused only by services or commands ++ * such as systemd or git grep. For brevity, hereafter we call just 'large' ++ * these bursts. BFQ *does not* weight-raise queues whose activations occur ++ * in a large burst. In addition, for each of these queues BFQ performs or ++ * does not perform idling depending on which choice boosts the throughput ++ * most. The exact choice depends on the device and request pattern at ++ * hand. ++ * ++ * Turning back to the next function, it implements all the steps needed ++ * to detect the occurrence of a large burst and to properly mark all the ++ * queues belonging to it (so that they can then be treated in a different ++ * way). This goal is achieved by maintaining a special "burst list" that ++ * holds, temporarily, the queues that belong to the burst in progress. The ++ * list is then used to mark these queues as belonging to a large burst if ++ * the burst does become large. The main steps are the following. ++ * ++ * . when the very first queue is activated, the queue is inserted into the ++ * list (as it could be the first queue in a possible burst) ++ * ++ * . if the current burst has not yet become large, and a queue Q that does ++ * not yet belong to the burst is activated shortly after the last time ++ * at which a new queue entered the burst list, then the function appends ++ * Q to the burst list ++ * ++ * . if, as a consequence of the previous step, the burst size reaches ++ * the large-burst threshold, then ++ * ++ * . all the queues in the burst list are marked as belonging to a ++ * large burst ++ * ++ * . the burst list is deleted; in fact, the burst list already served ++ * its purpose (keeping temporarily track of the queues in a burst, ++ * so as to be able to mark them as belonging to a large burst in the ++ * previous sub-step), and now is not needed any more ++ * ++ * . the device enters a large-burst mode ++ * ++ * . if a queue Q that does not belong to the burst is activated while ++ * the device is in large-burst mode and shortly after the last time ++ * at which a queue either entered the burst list or was marked as ++ * belonging to the current large burst, then Q is immediately marked ++ * as belonging to a large burst. ++ * ++ * . if a queue Q that does not belong to the burst is activated a while ++ * later, i.e., not shortly after, than the last time at which a queue ++ * either entered the burst list or was marked as belonging to the ++ * current large burst, then the current burst is deemed as finished and: ++ * ++ * . the large-burst mode is reset if set ++ * ++ * . the burst list is emptied ++ * ++ * . Q is inserted in the burst list, as Q may be the first queue ++ * in a possible new burst (then the burst list contains just Q ++ * after this step). ++ */ ++static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ bool idle_for_long_time) ++{ ++ /* ++ * If bfqq happened to be activated in a burst, but has been idle ++ * for at least as long as an interactive queue, then we assume ++ * that, in the overall I/O initiated in the burst, the I/O ++ * associated to bfqq is finished. So bfqq does not need to be ++ * treated as a queue belonging to a burst anymore. Accordingly, ++ * we reset bfqq's in_large_burst flag if set, and remove bfqq ++ * from the burst list if it's there. We do not decrement instead ++ * burst_size, because the fact that bfqq does not need to belong ++ * to the burst list any more does not invalidate the fact that ++ * bfqq may have been activated during the current burst. ++ */ ++ if (idle_for_long_time) { ++ hlist_del_init(&bfqq->burst_list_node); ++ bfq_clear_bfqq_in_large_burst(bfqq); ++ } ++ ++ /* ++ * If bfqq is already in the burst list or is part of a large ++ * burst, then there is nothing else to do. ++ */ ++ if (!hlist_unhashed(&bfqq->burst_list_node) || ++ bfq_bfqq_in_large_burst(bfqq)) ++ return; ++ ++ /* ++ * If bfqq's activation happens late enough, then the current ++ * burst is finished, and related data structures must be reset. ++ * ++ * In this respect, consider the special case where bfqq is the very ++ * first queue being activated. In this case, last_ins_in_burst is ++ * not yet significant when we get here. But it is easy to verify ++ * that, whether or not the following condition is true, bfqq will ++ * end up being inserted into the burst list. In particular the ++ * list will happen to contain only bfqq. And this is exactly what ++ * has to happen, as bfqq may be the first queue in a possible ++ * burst. ++ */ ++ if (time_is_before_jiffies(bfqd->last_ins_in_burst + ++ bfqd->bfq_burst_interval)) { ++ bfqd->large_burst = false; ++ bfq_reset_burst_list(bfqd, bfqq); ++ return; ++ } ++ ++ /* ++ * If we get here, then bfqq is being activated shortly after the ++ * last queue. So, if the current burst is also large, we can mark ++ * bfqq as belonging to this large burst immediately. ++ */ ++ if (bfqd->large_burst) { ++ bfq_mark_bfqq_in_large_burst(bfqq); ++ return; ++ } ++ ++ /* ++ * If we get here, then a large-burst state has not yet been ++ * reached, but bfqq is being activated shortly after the last ++ * queue. Then we add bfqq to the burst. ++ */ ++ bfq_add_to_burst(bfqd, bfqq); ++} ++ ++static void bfq_add_request(struct request *rq) ++{ ++ struct bfq_queue *bfqq = RQ_BFQQ(rq); ++ struct bfq_entity *entity = &bfqq->entity; ++ struct bfq_data *bfqd = bfqq->bfqd; ++ struct request *next_rq, *prev; ++ unsigned long old_wr_coeff = bfqq->wr_coeff; ++ bool interactive = false; ++ ++ bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq)); ++ bfqq->queued[rq_is_sync(rq)]++; ++ bfqd->queued++; ++ ++ elv_rb_add(&bfqq->sort_list, rq); ++ ++ /* ++ * Check if this request is a better next-serve candidate. ++ */ ++ prev = bfqq->next_rq; ++ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position); ++ BUG_ON(!next_rq); ++ bfqq->next_rq = next_rq; ++ ++ if (!bfq_bfqq_busy(bfqq)) { ++ bool soft_rt, in_burst, ++ idle_for_long_time = time_is_before_jiffies( ++ bfqq->budget_timeout + ++ bfqd->bfq_wr_min_idle_time); ++ ++#ifdef CONFIG_BFQ_GROUP_IOSCHED ++ bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, ++ rq->cmd_flags); ++#endif ++ if (bfq_bfqq_sync(bfqq)) { ++ bool already_in_burst = ++ !hlist_unhashed(&bfqq->burst_list_node) || ++ bfq_bfqq_in_large_burst(bfqq); ++ bfq_handle_burst(bfqd, bfqq, idle_for_long_time); ++ /* ++ * If bfqq was not already in the current burst, ++ * then, at this point, bfqq either has been ++ * added to the current burst or has caused the ++ * current burst to terminate. In particular, in ++ * the second case, bfqq has become the first ++ * queue in a possible new burst. ++ * In both cases last_ins_in_burst needs to be ++ * moved forward. ++ */ ++ if (!already_in_burst) ++ bfqd->last_ins_in_burst = jiffies; ++ } ++ ++ in_burst = bfq_bfqq_in_large_burst(bfqq); ++ soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && ++ !in_burst && ++ time_is_before_jiffies(bfqq->soft_rt_next_start); ++ interactive = !in_burst && idle_for_long_time; ++ entity->budget = max_t(unsigned long, bfqq->max_budget, ++ bfq_serv_to_charge(next_rq, bfqq)); ++ ++ if (!bfq_bfqq_IO_bound(bfqq)) { ++ if (time_before(jiffies, ++ RQ_BIC(rq)->ttime.last_end_request + ++ bfqd->bfq_slice_idle)) { ++ bfqq->requests_within_timer++; ++ if (bfqq->requests_within_timer >= ++ bfqd->bfq_requests_within_timer) ++ bfq_mark_bfqq_IO_bound(bfqq); ++ } else ++ bfqq->requests_within_timer = 0; ++ } ++ ++ if (!bfqd->low_latency) ++ goto add_bfqq_busy; ++ ++ /* ++ * If the queue: ++ * - is not being boosted, ++ * - has been idle for enough time, ++ * - is not a sync queue or is linked to a bfq_io_cq (it is ++ * shared "for its nature" or it is not shared and its ++ * requests have not been redirected to a shared queue) ++ * start a weight-raising period. ++ */ ++ if (old_wr_coeff == 1 && (interactive || soft_rt) && ++ (!bfq_bfqq_sync(bfqq) || bfqq->bic)) { ++ bfqq->wr_coeff = bfqd->bfq_wr_coeff; ++ if (interactive) ++ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); ++ else ++ bfqq->wr_cur_max_time = ++ bfqd->bfq_wr_rt_max_time; ++ bfq_log_bfqq(bfqd, bfqq, ++ "wrais starting at %lu, rais_max_time %u", ++ jiffies, ++ jiffies_to_msecs(bfqq->wr_cur_max_time)); ++ } else if (old_wr_coeff > 1) { ++ if (interactive) ++ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); ++ else if (in_burst || ++ (bfqq->wr_cur_max_time == ++ bfqd->bfq_wr_rt_max_time && ++ !soft_rt)) { ++ bfqq->wr_coeff = 1; ++ bfq_log_bfqq(bfqd, bfqq, ++ "wrais ending at %lu, rais_max_time %u", ++ jiffies, ++ jiffies_to_msecs(bfqq-> ++ wr_cur_max_time)); ++ } else if (time_before( ++ bfqq->last_wr_start_finish + ++ bfqq->wr_cur_max_time, ++ jiffies + ++ bfqd->bfq_wr_rt_max_time) && ++ soft_rt) { ++ /* ++ * ++ * The remaining weight-raising time is lower ++ * than bfqd->bfq_wr_rt_max_time, which means ++ * that the application is enjoying weight ++ * raising either because deemed soft-rt in ++ * the near past, or because deemed interactive ++ * a long ago. ++ * In both cases, resetting now the current ++ * remaining weight-raising time for the ++ * application to the weight-raising duration ++ * for soft rt applications would not cause any ++ * latency increase for the application (as the ++ * new duration would be higher than the ++ * remaining time). ++ * ++ * In addition, the application is now meeting ++ * the requirements for being deemed soft rt. ++ * In the end we can correctly and safely ++ * (re)charge the weight-raising duration for ++ * the application with the weight-raising ++ * duration for soft rt applications. ++ * ++ * In particular, doing this recharge now, i.e., ++ * before the weight-raising period for the ++ * application finishes, reduces the probability ++ * of the following negative scenario: ++ * 1) the weight of a soft rt application is ++ * raised at startup (as for any newly ++ * created application), ++ * 2) since the application is not interactive, ++ * at a certain time weight-raising is ++ * stopped for the application, ++ * 3) at that time the application happens to ++ * still have pending requests, and hence ++ * is destined to not have a chance to be ++ * deemed soft rt before these requests are ++ * completed (see the comments to the ++ * function bfq_bfqq_softrt_next_start() ++ * for details on soft rt detection), ++ * 4) these pending requests experience a high ++ * latency because the application is not ++ * weight-raised while they are pending. ++ */ ++ bfqq->last_wr_start_finish = jiffies; ++ bfqq->wr_cur_max_time = ++ bfqd->bfq_wr_rt_max_time; ++ } ++ } ++ if (old_wr_coeff != bfqq->wr_coeff) ++ entity->prio_changed = 1; ++add_bfqq_busy: ++ bfqq->last_idle_bklogged = jiffies; ++ bfqq->service_from_backlogged = 0; ++ bfq_clear_bfqq_softrt_update(bfqq); ++ bfq_add_bfqq_busy(bfqd, bfqq); ++ } else { ++ if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) && ++ time_is_before_jiffies( ++ bfqq->last_wr_start_finish + ++ bfqd->bfq_wr_min_inter_arr_async)) { ++ bfqq->wr_coeff = bfqd->bfq_wr_coeff; ++ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); ++ ++ bfqd->wr_busy_queues++; ++ entity->prio_changed = 1; ++ bfq_log_bfqq(bfqd, bfqq, ++ "non-idle wrais starting at %lu, rais_max_time %u", ++ jiffies, ++ jiffies_to_msecs(bfqq->wr_cur_max_time)); ++ } ++ if (prev != bfqq->next_rq) ++ bfq_updated_next_req(bfqd, bfqq); ++ } ++ ++ if (bfqd->low_latency && ++ (old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive)) ++ bfqq->last_wr_start_finish = jiffies; ++} ++ ++static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, ++ struct bio *bio) ++{ ++ struct task_struct *tsk = current; ++ struct bfq_io_cq *bic; ++ struct bfq_queue *bfqq; ++ ++ bic = bfq_bic_lookup(bfqd, tsk->io_context); ++ if (!bic) ++ return NULL; ++ ++ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio)); ++ if (bfqq) ++ return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio)); ++ ++ return NULL; ++} ++ ++static void bfq_activate_request(struct request_queue *q, struct request *rq) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ ++ bfqd->rq_in_driver++; ++ bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); ++ bfq_log(bfqd, "activate_request: new bfqd->last_position %llu", ++ (long long unsigned)bfqd->last_position); ++} ++ ++static void bfq_deactivate_request(struct request_queue *q, struct request *rq) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ ++ BUG_ON(bfqd->rq_in_driver == 0); ++ bfqd->rq_in_driver--; ++} ++ ++static void bfq_remove_request(struct request *rq) ++{ ++ struct bfq_queue *bfqq = RQ_BFQQ(rq); ++ struct bfq_data *bfqd = bfqq->bfqd; ++ const int sync = rq_is_sync(rq); ++ ++ if (bfqq->next_rq == rq) { ++ bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq); ++ bfq_updated_next_req(bfqd, bfqq); ++ } ++ ++ if (rq->queuelist.prev != &rq->queuelist) ++ list_del_init(&rq->queuelist); ++ BUG_ON(bfqq->queued[sync] == 0); ++ bfqq->queued[sync]--; ++ bfqd->queued--; ++ elv_rb_del(&bfqq->sort_list, rq); ++ ++ if (RB_EMPTY_ROOT(&bfqq->sort_list)) { ++ if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) ++ bfq_del_bfqq_busy(bfqd, bfqq, 1); ++ /* ++ * Remove queue from request-position tree as it is empty. ++ */ ++ if (bfqq->pos_root) { ++ rb_erase(&bfqq->pos_node, bfqq->pos_root); ++ bfqq->pos_root = NULL; ++ } ++ } ++ ++ if (rq->cmd_flags & REQ_META) { ++ BUG_ON(bfqq->meta_pending == 0); ++ bfqq->meta_pending--; ++ } ++#ifdef CONFIG_BFQ_GROUP_IOSCHED ++ bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags); ++#endif ++} ++ ++static int bfq_merge(struct request_queue *q, struct request **req, ++ struct bio *bio) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ struct request *__rq; ++ ++ __rq = bfq_find_rq_fmerge(bfqd, bio); ++ if (__rq && elv_rq_merge_ok(__rq, bio)) { ++ *req = __rq; ++ return ELEVATOR_FRONT_MERGE; ++ } ++ ++ return ELEVATOR_NO_MERGE; ++} ++ ++static void bfq_merged_request(struct request_queue *q, struct request *req, ++ int type) ++{ ++ if (type == ELEVATOR_FRONT_MERGE && ++ rb_prev(&req->rb_node) && ++ blk_rq_pos(req) < ++ blk_rq_pos(container_of(rb_prev(&req->rb_node), ++ struct request, rb_node))) { ++ struct bfq_queue *bfqq = RQ_BFQQ(req); ++ struct bfq_data *bfqd = bfqq->bfqd; ++ struct request *prev, *next_rq; ++ ++ /* Reposition request in its sort_list */ ++ elv_rb_del(&bfqq->sort_list, req); ++ elv_rb_add(&bfqq->sort_list, req); ++ /* Choose next request to be served for bfqq */ ++ prev = bfqq->next_rq; ++ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req, ++ bfqd->last_position); ++ BUG_ON(!next_rq); ++ bfqq->next_rq = next_rq; ++ } ++} ++ ++#ifdef CONFIG_BFQ_GROUP_IOSCHED ++static void bfq_bio_merged(struct request_queue *q, struct request *req, ++ struct bio *bio) ++{ ++ bfqg_stats_update_io_merged(bfqq_group(RQ_BFQQ(req)), bio->bi_rw); ++} ++#endif ++ ++static void bfq_merged_requests(struct request_queue *q, struct request *rq, ++ struct request *next) ++{ ++ struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next); ++ ++ /* ++ * If next and rq belong to the same bfq_queue and next is older ++ * than rq, then reposition rq in the fifo (by substituting next ++ * with rq). Otherwise, if next and rq belong to different ++ * bfq_queues, never reposition rq: in fact, we would have to ++ * reposition it with respect to next's position in its own fifo, ++ * which would most certainly be too expensive with respect to ++ * the benefits. ++ */ ++ if (bfqq == next_bfqq && ++ !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && ++ time_before(next->fifo_time, rq->fifo_time)) { ++ list_del_init(&rq->queuelist); ++ list_replace_init(&next->queuelist, &rq->queuelist); ++ rq->fifo_time = next->fifo_time; ++ } ++ ++ if (bfqq->next_rq == next) ++ bfqq->next_rq = rq; ++ ++ bfq_remove_request(next); ++#ifdef CONFIG_BFQ_GROUP_IOSCHED ++ bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags); ++#endif ++} ++ ++/* Must be called with bfqq != NULL */ ++static void bfq_bfqq_end_wr(struct bfq_queue *bfqq) ++{ ++ BUG_ON(!bfqq); ++ if (bfq_bfqq_busy(bfqq)) ++ bfqq->bfqd->wr_busy_queues--; ++ bfqq->wr_coeff = 1; ++ bfqq->wr_cur_max_time = 0; ++ /* Trigger a weight change on the next activation of the queue */ ++ bfqq->entity.prio_changed = 1; ++} ++ ++static void bfq_end_wr_async_queues(struct bfq_data *bfqd, ++ struct bfq_group *bfqg) ++{ ++ int i, j; ++ ++ for (i = 0; i < 2; i++) ++ for (j = 0; j < IOPRIO_BE_NR; j++) ++ if (bfqg->async_bfqq[i][j]) ++ bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]); ++ if (bfqg->async_idle_bfqq) ++ bfq_bfqq_end_wr(bfqg->async_idle_bfqq); ++} ++ ++static void bfq_end_wr(struct bfq_data *bfqd) ++{ ++ struct bfq_queue *bfqq; ++ ++ spin_lock_irq(bfqd->queue->queue_lock); ++ ++ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) ++ bfq_bfqq_end_wr(bfqq); ++ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) ++ bfq_bfqq_end_wr(bfqq); ++ bfq_end_wr_async(bfqd); ++ ++ spin_unlock_irq(bfqd->queue->queue_lock); ++} ++ ++static int bfq_allow_merge(struct request_queue *q, struct request *rq, ++ struct bio *bio) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ struct bfq_io_cq *bic; ++ ++ /* ++ * Disallow merge of a sync bio into an async request. ++ */ ++ if (bfq_bio_sync(bio) && !rq_is_sync(rq)) ++ return 0; ++ ++ /* ++ * Lookup the bfqq that this bio will be queued with. Allow ++ * merge only if rq is queued there. ++ * Queue lock is held here. ++ */ ++ bic = bfq_bic_lookup(bfqd, current->io_context); ++ if (!bic) ++ return 0; ++ ++ return bic_to_bfqq(bic, bfq_bio_sync(bio)) == RQ_BFQQ(rq); ++} ++ ++static void __bfq_set_in_service_queue(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ if (bfqq) { ++#ifdef CONFIG_BFQ_GROUP_IOSCHED ++ bfqg_stats_update_avg_queue_size(bfqq_group(bfqq)); ++#endif ++ bfq_mark_bfqq_must_alloc(bfqq); ++ bfq_mark_bfqq_budget_new(bfqq); ++ bfq_clear_bfqq_fifo_expire(bfqq); ++ ++ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "set_in_service_queue, cur-budget = %d", ++ bfqq->entity.budget); ++ } ++ ++ bfqd->in_service_queue = bfqq; ++} ++ ++/* ++ * Get and set a new queue for service. ++ */ ++static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd) ++{ ++ struct bfq_queue *bfqq = bfq_get_next_queue(bfqd); ++ ++ __bfq_set_in_service_queue(bfqd, bfqq); ++ return bfqq; ++} ++ ++/* ++ * If enough samples have been computed, return the current max budget ++ * stored in bfqd, which is dynamically updated according to the ++ * estimated disk peak rate; otherwise return the default max budget ++ */ ++static int bfq_max_budget(struct bfq_data *bfqd) ++{ ++ if (bfqd->budgets_assigned < bfq_stats_min_budgets) ++ return bfq_default_max_budget; ++ else ++ return bfqd->bfq_max_budget; ++} ++ ++/* ++ * Return min budget, which is a fraction of the current or default ++ * max budget (trying with 1/32) ++ */ ++static int bfq_min_budget(struct bfq_data *bfqd) ++{ ++ if (bfqd->budgets_assigned < bfq_stats_min_budgets) ++ return bfq_default_max_budget / 32; ++ else ++ return bfqd->bfq_max_budget / 32; ++} ++ ++static void bfq_arm_slice_timer(struct bfq_data *bfqd) ++{ ++ struct bfq_queue *bfqq = bfqd->in_service_queue; ++ struct bfq_io_cq *bic; ++ unsigned long sl; ++ ++ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); ++ ++ /* Processes have exited, don't wait. */ ++ bic = bfqd->in_service_bic; ++ if (!bic || atomic_read(&bic->icq.ioc->active_ref) == 0) ++ return; ++ ++ bfq_mark_bfqq_wait_request(bfqq); ++ ++ /* ++ * We don't want to idle for seeks, but we do want to allow ++ * fair distribution of slice time for a process doing back-to-back ++ * seeks. So allow a little bit of time for him to submit a new rq. ++ * ++ * To prevent processes with (partly) seeky workloads from ++ * being too ill-treated, grant them a small fraction of the ++ * assigned budget before reducing the waiting time to ++ * BFQ_MIN_TT. This happened to help reduce latency. ++ */ ++ sl = bfqd->bfq_slice_idle; ++ /* ++ * Unless the queue is being weight-raised or the scenario is ++ * asymmetric, grant only minimum idle time if the queue either ++ * has been seeky for long enough or has already proved to be ++ * constantly seeky. ++ */ ++ if (bfq_sample_valid(bfqq->seek_samples) && ++ ((BFQQ_SEEKY(bfqq) && bfqq->entity.service > ++ bfq_max_budget(bfqq->bfqd) / 8) || ++ bfq_bfqq_constantly_seeky(bfqq)) && bfqq->wr_coeff == 1 && ++ bfq_symmetric_scenario(bfqd)) ++ sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT)); ++ else if (bfqq->wr_coeff > 1) ++ sl = sl * 3; ++ bfqd->last_idling_start = ktime_get(); ++ mod_timer(&bfqd->idle_slice_timer, jiffies + sl); ++#ifdef CONFIG_BFQ_GROUP_IOSCHED ++ bfqg_stats_set_start_idle_time(bfqq_group(bfqq)); ++#endif ++ bfq_log(bfqd, "arm idle: %u/%u ms", ++ jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle)); ++} ++ ++/* ++ * Set the maximum time for the in-service queue to consume its ++ * budget. This prevents seeky processes from lowering the disk ++ * throughput (always guaranteed with a time slice scheme as in CFQ). ++ */ ++static void bfq_set_budget_timeout(struct bfq_data *bfqd) ++{ ++ struct bfq_queue *bfqq = bfqd->in_service_queue; ++ unsigned int timeout_coeff; ++ if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time) ++ timeout_coeff = 1; ++ else ++ timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight; ++ ++ bfqd->last_budget_start = ktime_get(); ++ ++ bfq_clear_bfqq_budget_new(bfqq); ++ bfqq->budget_timeout = jiffies + ++ bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff; ++ ++ bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u", ++ jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * ++ timeout_coeff)); ++} ++ ++/* ++ * Move request from internal lists to the request queue dispatch list. ++ */ ++static void bfq_dispatch_insert(struct request_queue *q, struct request *rq) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ struct bfq_queue *bfqq = RQ_BFQQ(rq); ++ ++ /* ++ * For consistency, the next instruction should have been executed ++ * after removing the request from the queue and dispatching it. ++ * We execute instead this instruction before bfq_remove_request() ++ * (and hence introduce a temporary inconsistency), for efficiency. ++ * In fact, in a forced_dispatch, this prevents two counters related ++ * to bfqq->dispatched to risk to be uselessly decremented if bfqq ++ * is not in service, and then to be incremented again after ++ * incrementing bfqq->dispatched. ++ */ ++ bfqq->dispatched++; ++ bfq_remove_request(rq); ++ elv_dispatch_sort(q, rq); ++ ++ if (bfq_bfqq_sync(bfqq)) ++ bfqd->sync_flight++; ++#ifdef CONFIG_BFQ_GROUP_IOSCHED ++ bfqg_stats_update_dispatch(bfqq_group(bfqq), blk_rq_bytes(rq), ++ rq->cmd_flags); ++#endif ++} ++ ++/* ++ * Return expired entry, or NULL to just start from scratch in rbtree. ++ */ ++static struct request *bfq_check_fifo(struct bfq_queue *bfqq) ++{ ++ struct request *rq = NULL; ++ ++ if (bfq_bfqq_fifo_expire(bfqq)) ++ return NULL; ++ ++ bfq_mark_bfqq_fifo_expire(bfqq); ++ ++ if (list_empty(&bfqq->fifo)) ++ return NULL; ++ ++ rq = rq_entry_fifo(bfqq->fifo.next); ++ ++ if (time_before(jiffies, rq->fifo_time)) ++ return NULL; ++ ++ return rq; ++} ++ ++static int bfq_bfqq_budget_left(struct bfq_queue *bfqq) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ return entity->budget - entity->service; ++} ++ ++static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ BUG_ON(bfqq != bfqd->in_service_queue); ++ ++ __bfq_bfqd_reset_in_service(bfqd); ++ ++ if (RB_EMPTY_ROOT(&bfqq->sort_list)) { ++ /* ++ * Overloading budget_timeout field to store the time ++ * at which the queue remains with no backlog; used by ++ * the weight-raising mechanism. ++ */ ++ bfqq->budget_timeout = jiffies; ++ bfq_del_bfqq_busy(bfqd, bfqq, 1); ++ } else ++ bfq_activate_bfqq(bfqd, bfqq); ++} ++ ++/** ++ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior. ++ * @bfqd: device data. ++ * @bfqq: queue to update. ++ * @reason: reason for expiration. ++ * ++ * Handle the feedback on @bfqq budget at queue expiration. ++ * See the body for detailed comments. ++ */ ++static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ enum bfqq_expiration reason) ++{ ++ struct request *next_rq; ++ int budget, min_budget; ++ ++ budget = bfqq->max_budget; ++ min_budget = bfq_min_budget(bfqd); ++ ++ BUG_ON(bfqq != bfqd->in_service_queue); ++ ++ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d", ++ bfqq->entity.budget, bfq_bfqq_budget_left(bfqq)); ++ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %d, min budg %d", ++ budget, bfq_min_budget(bfqd)); ++ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d", ++ bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue)); ++ ++ if (bfq_bfqq_sync(bfqq)) { ++ switch (reason) { ++ /* ++ * Caveat: in all the following cases we trade latency ++ * for throughput. ++ */ ++ case BFQ_BFQQ_TOO_IDLE: ++ /* ++ * This is the only case where we may reduce ++ * the budget: if there is no request of the ++ * process still waiting for completion, then ++ * we assume (tentatively) that the timer has ++ * expired because the batch of requests of ++ * the process could have been served with a ++ * smaller budget. Hence, betting that ++ * process will behave in the same way when it ++ * becomes backlogged again, we reduce its ++ * next budget. As long as we guess right, ++ * this budget cut reduces the latency ++ * experienced by the process. ++ * ++ * However, if there are still outstanding ++ * requests, then the process may have not yet ++ * issued its next request just because it is ++ * still waiting for the completion of some of ++ * the still outstanding ones. So in this ++ * subcase we do not reduce its budget, on the ++ * contrary we increase it to possibly boost ++ * the throughput, as discussed in the ++ * comments to the BUDGET_TIMEOUT case. ++ */ ++ if (bfqq->dispatched > 0) /* still outstanding reqs */ ++ budget = min(budget * 2, bfqd->bfq_max_budget); ++ else { ++ if (budget > 5 * min_budget) ++ budget -= 4 * min_budget; ++ else ++ budget = min_budget; ++ } ++ break; ++ case BFQ_BFQQ_BUDGET_TIMEOUT: ++ /* ++ * We double the budget here because: 1) it ++ * gives the chance to boost the throughput if ++ * this is not a seeky process (which may have ++ * bumped into this timeout because of, e.g., ++ * ZBR), 2) together with charge_full_budget ++ * it helps give seeky processes higher ++ * timestamps, and hence be served less ++ * frequently. ++ */ ++ budget = min(budget * 2, bfqd->bfq_max_budget); ++ break; ++ case BFQ_BFQQ_BUDGET_EXHAUSTED: ++ /* ++ * The process still has backlog, and did not ++ * let either the budget timeout or the disk ++ * idling timeout expire. Hence it is not ++ * seeky, has a short thinktime and may be ++ * happy with a higher budget too. So ++ * definitely increase the budget of this good ++ * candidate to boost the disk throughput. ++ */ ++ budget = min(budget * 4, bfqd->bfq_max_budget); ++ break; ++ case BFQ_BFQQ_NO_MORE_REQUESTS: ++ /* ++ * Leave the budget unchanged. ++ */ ++ default: ++ return; ++ } ++ } else ++ /* ++ * Async queues get always the maximum possible budget ++ * (their ability to dispatch is limited by ++ * @bfqd->bfq_max_budget_async_rq). ++ */ ++ budget = bfqd->bfq_max_budget; ++ ++ bfqq->max_budget = budget; ++ ++ if (bfqd->budgets_assigned >= bfq_stats_min_budgets && ++ !bfqd->bfq_user_max_budget) ++ bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget); ++ ++ /* ++ * Make sure that we have enough budget for the next request. ++ * Since the finish time of the bfqq must be kept in sync with ++ * the budget, be sure to call __bfq_bfqq_expire() after the ++ * update. ++ */ ++ next_rq = bfqq->next_rq; ++ if (next_rq) ++ bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget, ++ bfq_serv_to_charge(next_rq, bfqq)); ++ else ++ bfqq->entity.budget = bfqq->max_budget; ++ ++ bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d", ++ next_rq ? blk_rq_sectors(next_rq) : 0, ++ bfqq->entity.budget); ++} ++ ++static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout) ++{ ++ unsigned long max_budget; ++ ++ /* ++ * The max_budget calculated when autotuning is equal to the ++ * amount of sectors transfered in timeout_sync at the ++ * estimated peak rate. ++ */ ++ max_budget = (unsigned long)(peak_rate * 1000 * ++ timeout >> BFQ_RATE_SHIFT); ++ ++ return max_budget; ++} ++ ++/* ++ * In addition to updating the peak rate, checks whether the process ++ * is "slow", and returns 1 if so. This slow flag is used, in addition ++ * to the budget timeout, to reduce the amount of service provided to ++ * seeky processes, and hence reduce their chances to lower the ++ * throughput. See the code for more details. ++ */ ++static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ bool compensate, enum bfqq_expiration reason) ++{ ++ u64 bw, usecs, expected, timeout; ++ ktime_t delta; ++ int update = 0; ++ ++ if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq)) ++ return false; ++ ++ if (compensate) ++ delta = bfqd->last_idling_start; ++ else ++ delta = ktime_get(); ++ delta = ktime_sub(delta, bfqd->last_budget_start); ++ usecs = ktime_to_us(delta); ++ ++ /* Don't trust short/unrealistic values. */ ++ if (usecs < 100 || usecs >= LONG_MAX) ++ return false; ++ ++ /* ++ * Calculate the bandwidth for the last slice. We use a 64 bit ++ * value to store the peak rate, in sectors per usec in fixed ++ * point math. We do so to have enough precision in the estimate ++ * and to avoid overflows. ++ */ ++ bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT; ++ do_div(bw, (unsigned long)usecs); ++ ++ timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]); ++ ++ /* ++ * Use only long (> 20ms) intervals to filter out spikes for ++ * the peak rate estimation. ++ */ ++ if (usecs > 20000) { ++ if (bw > bfqd->peak_rate || ++ (!BFQQ_SEEKY(bfqq) && ++ reason == BFQ_BFQQ_BUDGET_TIMEOUT)) { ++ bfq_log(bfqd, "measured bw =%llu", bw); ++ /* ++ * To smooth oscillations use a low-pass filter with ++ * alpha=7/8, i.e., ++ * new_rate = (7/8) * old_rate + (1/8) * bw ++ */ ++ do_div(bw, 8); ++ if (bw == 0) ++ return 0; ++ bfqd->peak_rate *= 7; ++ do_div(bfqd->peak_rate, 8); ++ bfqd->peak_rate += bw; ++ update = 1; ++ bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate); ++ } ++ ++ update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1; ++ ++ if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES) ++ bfqd->peak_rate_samples++; ++ ++ if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES && ++ update) { ++ int dev_type = blk_queue_nonrot(bfqd->queue); ++ if (bfqd->bfq_user_max_budget == 0) { ++ bfqd->bfq_max_budget = ++ bfq_calc_max_budget(bfqd->peak_rate, ++ timeout); ++ bfq_log(bfqd, "new max_budget=%d", ++ bfqd->bfq_max_budget); ++ } ++ if (bfqd->device_speed == BFQ_BFQD_FAST && ++ bfqd->peak_rate < device_speed_thresh[dev_type]) { ++ bfqd->device_speed = BFQ_BFQD_SLOW; ++ bfqd->RT_prod = R_slow[dev_type] * ++ T_slow[dev_type]; ++ } else if (bfqd->device_speed == BFQ_BFQD_SLOW && ++ bfqd->peak_rate > device_speed_thresh[dev_type]) { ++ bfqd->device_speed = BFQ_BFQD_FAST; ++ bfqd->RT_prod = R_fast[dev_type] * ++ T_fast[dev_type]; ++ } ++ } ++ } ++ ++ /* ++ * If the process has been served for a too short time ++ * interval to let its possible sequential accesses prevail on ++ * the initial seek time needed to move the disk head on the ++ * first sector it requested, then give the process a chance ++ * and for the moment return false. ++ */ ++ if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8) ++ return false; ++ ++ /* ++ * A process is considered ``slow'' (i.e., seeky, so that we ++ * cannot treat it fairly in the service domain, as it would ++ * slow down too much the other processes) if, when a slice ++ * ends for whatever reason, it has received service at a ++ * rate that would not be high enough to complete the budget ++ * before the budget timeout expiration. ++ */ ++ expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT; ++ ++ /* ++ * Caveat: processes doing IO in the slower disk zones will ++ * tend to be slow(er) even if not seeky. And the estimated ++ * peak rate will actually be an average over the disk ++ * surface. Hence, to not be too harsh with unlucky processes, ++ * we keep a budget/3 margin of safety before declaring a ++ * process slow. ++ */ ++ return expected > (4 * bfqq->entity.budget) / 3; ++} ++ ++/* ++ * To be deemed as soft real-time, an application must meet two ++ * requirements. First, the application must not require an average ++ * bandwidth higher than the approximate bandwidth required to playback or ++ * record a compressed high-definition video. ++ * The next function is invoked on the completion of the last request of a ++ * batch, to compute the next-start time instant, soft_rt_next_start, such ++ * that, if the next request of the application does not arrive before ++ * soft_rt_next_start, then the above requirement on the bandwidth is met. ++ * ++ * The second requirement is that the request pattern of the application is ++ * isochronous, i.e., that, after issuing a request or a batch of requests, ++ * the application stops issuing new requests until all its pending requests ++ * have been completed. After that, the application may issue a new batch, ++ * and so on. ++ * For this reason the next function is invoked to compute ++ * soft_rt_next_start only for applications that meet this requirement, ++ * whereas soft_rt_next_start is set to infinity for applications that do ++ * not. ++ * ++ * Unfortunately, even a greedy application may happen to behave in an ++ * isochronous way if the CPU load is high. In fact, the application may ++ * stop issuing requests while the CPUs are busy serving other processes, ++ * then restart, then stop again for a while, and so on. In addition, if ++ * the disk achieves a low enough throughput with the request pattern ++ * issued by the application (e.g., because the request pattern is random ++ * and/or the device is slow), then the application may meet the above ++ * bandwidth requirement too. To prevent such a greedy application to be ++ * deemed as soft real-time, a further rule is used in the computation of ++ * soft_rt_next_start: soft_rt_next_start must be higher than the current ++ * time plus the maximum time for which the arrival of a request is waited ++ * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle. ++ * This filters out greedy applications, as the latter issue instead their ++ * next request as soon as possible after the last one has been completed ++ * (in contrast, when a batch of requests is completed, a soft real-time ++ * application spends some time processing data). ++ * ++ * Unfortunately, the last filter may easily generate false positives if ++ * only bfqd->bfq_slice_idle is used as a reference time interval and one ++ * or both the following cases occur: ++ * 1) HZ is so low that the duration of a jiffy is comparable to or higher ++ * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with ++ * HZ=100. ++ * 2) jiffies, instead of increasing at a constant rate, may stop increasing ++ * for a while, then suddenly 'jump' by several units to recover the lost ++ * increments. This seems to happen, e.g., inside virtual machines. ++ * To address this issue, we do not use as a reference time interval just ++ * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In ++ * particular we add the minimum number of jiffies for which the filter ++ * seems to be quite precise also in embedded systems and KVM/QEMU virtual ++ * machines. ++ */ ++static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ return max(bfqq->last_idle_bklogged + ++ HZ * bfqq->service_from_backlogged / ++ bfqd->bfq_wr_max_softrt_rate, ++ jiffies + bfqq->bfqd->bfq_slice_idle + 4); ++} ++ ++/* ++ * Return the largest-possible time instant such that, for as long as possible, ++ * the current time will be lower than this time instant according to the macro ++ * time_is_before_jiffies(). ++ */ ++static unsigned long bfq_infinity_from_now(unsigned long now) ++{ ++ return now + ULONG_MAX / 2; ++} ++ ++/** ++ * bfq_bfqq_expire - expire a queue. ++ * @bfqd: device owning the queue. ++ * @bfqq: the queue to expire. ++ * @compensate: if true, compensate for the time spent idling. ++ * @reason: the reason causing the expiration. ++ * ++ * ++ * If the process associated to the queue is slow (i.e., seeky), or in ++ * case of budget timeout, or, finally, if it is async, we ++ * artificially charge it an entire budget (independently of the ++ * actual service it received). As a consequence, the queue will get ++ * higher timestamps than the correct ones upon reactivation, and ++ * hence it will be rescheduled as if it had received more service ++ * than what it actually received. In the end, this class of processes ++ * will receive less service in proportion to how slowly they consume ++ * their budgets (and hence how seriously they tend to lower the ++ * throughput). ++ * ++ * In contrast, when a queue expires because it has been idling for ++ * too much or because it exhausted its budget, we do not touch the ++ * amount of service it has received. Hence when the queue will be ++ * reactivated and its timestamps updated, the latter will be in sync ++ * with the actual service received by the queue until expiration. ++ * ++ * Charging a full budget to the first type of queues and the exact ++ * service to the others has the effect of using the WF2Q+ policy to ++ * schedule the former on a timeslice basis, without violating the ++ * service domain guarantees of the latter. ++ */ ++static void bfq_bfqq_expire(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ bool compensate, ++ enum bfqq_expiration reason) ++{ ++ bool slow; ++ BUG_ON(bfqq != bfqd->in_service_queue); ++ ++ /* ++ * Update disk peak rate for autotuning and check whether the ++ * process is slow (see bfq_update_peak_rate). ++ */ ++ slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason); ++ ++ /* ++ * As above explained, 'punish' slow (i.e., seeky), timed-out ++ * and async queues, to favor sequential sync workloads. ++ * ++ * Processes doing I/O in the slower disk zones will tend to be ++ * slow(er) even if not seeky. Hence, since the estimated peak ++ * rate is actually an average over the disk surface, these ++ * processes may timeout just for bad luck. To avoid punishing ++ * them we do not charge a full budget to a process that ++ * succeeded in consuming at least 2/3 of its budget. ++ */ ++ if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT && ++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)) ++ bfq_bfqq_charge_full_budget(bfqq); ++ ++ bfqq->service_from_backlogged += bfqq->entity.service; ++ ++ if (BFQQ_SEEKY(bfqq) && reason == BFQ_BFQQ_BUDGET_TIMEOUT && ++ !bfq_bfqq_constantly_seeky(bfqq)) { ++ bfq_mark_bfqq_constantly_seeky(bfqq); ++ if (!blk_queue_nonrot(bfqd->queue)) ++ bfqd->const_seeky_busy_in_flight_queues++; ++ } ++ ++ if (reason == BFQ_BFQQ_TOO_IDLE && ++ bfqq->entity.service <= 2 * bfqq->entity.budget / 10 ) ++ bfq_clear_bfqq_IO_bound(bfqq); ++ ++ if (bfqd->low_latency && bfqq->wr_coeff == 1) ++ bfqq->last_wr_start_finish = jiffies; ++ ++ if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 && ++ RB_EMPTY_ROOT(&bfqq->sort_list)) { ++ /* ++ * If we get here, and there are no outstanding requests, ++ * then the request pattern is isochronous (see the comments ++ * to the function bfq_bfqq_softrt_next_start()). Hence we ++ * can compute soft_rt_next_start. If, instead, the queue ++ * still has outstanding requests, then we have to wait ++ * for the completion of all the outstanding requests to ++ * discover whether the request pattern is actually ++ * isochronous. ++ */ ++ if (bfqq->dispatched == 0) ++ bfqq->soft_rt_next_start = ++ bfq_bfqq_softrt_next_start(bfqd, bfqq); ++ else { ++ /* ++ * The application is still waiting for the ++ * completion of one or more requests: ++ * prevent it from possibly being incorrectly ++ * deemed as soft real-time by setting its ++ * soft_rt_next_start to infinity. In fact, ++ * without this assignment, the application ++ * would be incorrectly deemed as soft ++ * real-time if: ++ * 1) it issued a new request before the ++ * completion of all its in-flight ++ * requests, and ++ * 2) at that time, its soft_rt_next_start ++ * happened to be in the past. ++ */ ++ bfqq->soft_rt_next_start = ++ bfq_infinity_from_now(jiffies); ++ /* ++ * Schedule an update of soft_rt_next_start to when ++ * the task may be discovered to be isochronous. ++ */ ++ bfq_mark_bfqq_softrt_update(bfqq); ++ } ++ } ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, ++ slow, bfqq->dispatched, bfq_bfqq_idle_window(bfqq)); ++ ++ /* ++ * Increase, decrease or leave budget unchanged according to ++ * reason. ++ */ ++ __bfq_bfqq_recalc_budget(bfqd, bfqq, reason); ++ __bfq_bfqq_expire(bfqd, bfqq); ++} ++ ++/* ++ * Budget timeout is not implemented through a dedicated timer, but ++ * just checked on request arrivals and completions, as well as on ++ * idle timer expirations. ++ */ ++static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) ++{ ++ if (bfq_bfqq_budget_new(bfqq) || ++ time_before(jiffies, bfqq->budget_timeout)) ++ return false; ++ return true; ++} ++ ++/* ++ * If we expire a queue that is waiting for the arrival of a new ++ * request, we may prevent the fictitious timestamp back-shifting that ++ * allows the guarantees of the queue to be preserved (see [1] for ++ * this tricky aspect). Hence we return true only if this condition ++ * does not hold, or if the queue is slow enough to deserve only to be ++ * kicked off for preserving a high throughput. ++*/ ++static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) ++{ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "may_budget_timeout: wait_request %d left %d timeout %d", ++ bfq_bfqq_wait_request(bfqq), ++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3, ++ bfq_bfqq_budget_timeout(bfqq)); ++ ++ return (!bfq_bfqq_wait_request(bfqq) || ++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3) ++ && ++ bfq_bfqq_budget_timeout(bfqq); ++} ++ ++/* ++ * For a queue that becomes empty, device idling is allowed only if ++ * this function returns true for that queue. As a consequence, since ++ * device idling plays a critical role for both throughput boosting ++ * and service guarantees, the return value of this function plays a ++ * critical role as well. ++ * ++ * In a nutshell, this function returns true only if idling is ++ * beneficial for throughput or, even if detrimental for throughput, ++ * idling is however necessary to preserve service guarantees (low ++ * latency, desired throughput distribution, ...). In particular, on ++ * NCQ-capable devices, this function tries to return false, so as to ++ * help keep the drives' internal queues full, whenever this helps the ++ * device boost the throughput without causing any service-guarantee ++ * issue. ++ * ++ * In more detail, the return value of this function is obtained by, ++ * first, computing a number of boolean variables that take into ++ * account throughput and service-guarantee issues, and, then, ++ * combining these variables in a logical expression. Most of the ++ * issues taken into account are not trivial. We discuss these issues ++ * while introducing the variables. ++ */ ++static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) ++{ ++ struct bfq_data *bfqd = bfqq->bfqd; ++ bool idling_boosts_thr, idling_boosts_thr_without_issues, ++ all_queues_seeky, on_hdd_and_not_all_queues_seeky, ++ idling_needed_for_service_guarantees, ++ asymmetric_scenario; ++ ++ /* ++ * The next variable takes into account the cases where idling ++ * boosts the throughput. ++ * ++ * The value of the variable is computed considering, first, that ++ * idling is virtually always beneficial for the throughput if: ++ * (a) the device is not NCQ-capable, or ++ * (b) regardless of the presence of NCQ, the device is rotational ++ * and the request pattern for bfqq is I/O-bound and sequential. ++ * ++ * Secondly, and in contrast to the above item (b), idling an ++ * NCQ-capable flash-based device would not boost the ++ * throughput even with sequential I/O; rather it would lower ++ * the throughput in proportion to how fast the device ++ * is. Accordingly, the next variable is true if any of the ++ * above conditions (a) and (b) is true, and, in particular, ++ * happens to be false if bfqd is an NCQ-capable flash-based ++ * device. ++ */ ++ idling_boosts_thr = !bfqd->hw_tag || ++ (!blk_queue_nonrot(bfqd->queue) && bfq_bfqq_IO_bound(bfqq) && ++ bfq_bfqq_idle_window(bfqq)) ; ++ ++ /* ++ * The value of the next variable, ++ * idling_boosts_thr_without_issues, is equal to that of ++ * idling_boosts_thr, unless a special case holds. In this ++ * special case, described below, idling may cause problems to ++ * weight-raised queues. ++ * ++ * When the request pool is saturated (e.g., in the presence ++ * of write hogs), if the processes associated with ++ * non-weight-raised queues ask for requests at a lower rate, ++ * then processes associated with weight-raised queues have a ++ * higher probability to get a request from the pool ++ * immediately (or at least soon) when they need one. Thus ++ * they have a higher probability to actually get a fraction ++ * of the device throughput proportional to their high ++ * weight. This is especially true with NCQ-capable drives, ++ * which enqueue several requests in advance, and further ++ * reorder internally-queued requests. ++ * ++ * For this reason, we force to false the value of ++ * idling_boosts_thr_without_issues if there are weight-raised ++ * busy queues. In this case, and if bfqq is not weight-raised, ++ * this guarantees that the device is not idled for bfqq (if, ++ * instead, bfqq is weight-raised, then idling will be ++ * guaranteed by another variable, see below). Combined with ++ * the timestamping rules of BFQ (see [1] for details), this ++ * behavior causes bfqq, and hence any sync non-weight-raised ++ * queue, to get a lower number of requests served, and thus ++ * to ask for a lower number of requests from the request ++ * pool, before the busy weight-raised queues get served ++ * again. This often mitigates starvation problems in the ++ * presence of heavy write workloads and NCQ, thereby ++ * guaranteeing a higher application and system responsiveness ++ * in these hostile scenarios. ++ */ ++ idling_boosts_thr_without_issues = idling_boosts_thr && ++ bfqd->wr_busy_queues == 0; ++ ++ /* ++ * There are then two cases where idling must be performed not ++ * for throughput concerns, but to preserve service ++ * guarantees. In the description of these cases, we say, for ++ * short, that a queue is sequential/random if the process ++ * associated to the queue issues sequential/random requests ++ * (in the second case the queue may be tagged as seeky or ++ * even constantly_seeky). ++ * ++ * To introduce the first case, we note that, since ++ * bfq_bfqq_idle_window(bfqq) is false if the device is ++ * NCQ-capable and bfqq is random (see ++ * bfq_update_idle_window()), then, from the above two ++ * assignments it follows that ++ * idling_boosts_thr_without_issues is false if the device is ++ * NCQ-capable and bfqq is random. Therefore, for this case, ++ * device idling would never be allowed if we used just ++ * idling_boosts_thr_without_issues to decide whether to allow ++ * it. And, beneficially, this would imply that throughput ++ * would always be boosted also with random I/O on NCQ-capable ++ * HDDs. ++ * ++ * But we must be careful on this point, to avoid an unfair ++ * treatment for bfqq. In fact, because of the same above ++ * assignments, idling_boosts_thr_without_issues is, on the ++ * other hand, true if 1) the device is an HDD and bfqq is ++ * sequential, and 2) there are no busy weight-raised ++ * queues. As a consequence, if we used just ++ * idling_boosts_thr_without_issues to decide whether to idle ++ * the device, then with an HDD we might easily bump into a ++ * scenario where queues that are sequential and I/O-bound ++ * would enjoy idling, whereas random queues would not. The ++ * latter might then get a low share of the device throughput, ++ * simply because the former would get many requests served ++ * after being set as in service, while the latter would not. ++ * ++ * To address this issue, we start by setting to true a ++ * sentinel variable, on_hdd_and_not_all_queues_seeky, if the ++ * device is rotational and not all queues with pending or ++ * in-flight requests are constantly seeky (i.e., there are ++ * active sequential queues, and bfqq might then be mistreated ++ * if it does not enjoy idling because it is random). ++ */ ++ all_queues_seeky = bfq_bfqq_constantly_seeky(bfqq) && ++ bfqd->busy_in_flight_queues == ++ bfqd->const_seeky_busy_in_flight_queues; ++ ++ on_hdd_and_not_all_queues_seeky = ++ !blk_queue_nonrot(bfqd->queue) && !all_queues_seeky; ++ ++ /* ++ * To introduce the second case where idling needs to be ++ * performed to preserve service guarantees, we can note that ++ * allowing the drive to enqueue more than one request at a ++ * time, and hence delegating de facto final scheduling ++ * decisions to the drive's internal scheduler, causes loss of ++ * control on the actual request service order. In particular, ++ * the critical situation is when requests from different ++ * processes happens to be present, at the same time, in the ++ * internal queue(s) of the drive. In such a situation, the ++ * drive, by deciding the service order of the ++ * internally-queued requests, does determine also the actual ++ * throughput distribution among these processes. But the ++ * drive typically has no notion or concern about per-process ++ * throughput distribution, and makes its decisions only on a ++ * per-request basis. Therefore, the service distribution ++ * enforced by the drive's internal scheduler is likely to ++ * coincide with the desired device-throughput distribution ++ * only in a completely symmetric scenario where: ++ * (i) each of these processes must get the same throughput as ++ * the others; ++ * (ii) all these processes have the same I/O pattern ++ (either sequential or random). ++ * In fact, in such a scenario, the drive will tend to treat ++ * the requests of each of these processes in about the same ++ * way as the requests of the others, and thus to provide ++ * each of these processes with about the same throughput ++ * (which is exactly the desired throughput distribution). In ++ * contrast, in any asymmetric scenario, device idling is ++ * certainly needed to guarantee that bfqq receives its ++ * assigned fraction of the device throughput (see [1] for ++ * details). ++ * ++ * We address this issue by controlling, actually, only the ++ * symmetry sub-condition (i), i.e., provided that ++ * sub-condition (i) holds, idling is not performed, ++ * regardless of whether sub-condition (ii) holds. In other ++ * words, only if sub-condition (i) holds, then idling is ++ * allowed, and the device tends to be prevented from queueing ++ * many requests, possibly of several processes. The reason ++ * for not controlling also sub-condition (ii) is that, first, ++ * in the case of an HDD, the asymmetry in terms of types of ++ * I/O patterns is already taken in to account in the above ++ * sentinel variable ++ * on_hdd_and_not_all_queues_seeky. Secondly, in the case of a ++ * flash-based device, we prefer however to privilege ++ * throughput (and idling lowers throughput for this type of ++ * devices), for the following reasons: ++ * 1) differently from HDDs, the service time of random ++ * requests is not orders of magnitudes lower than the service ++ * time of sequential requests; thus, even if processes doing ++ * sequential I/O get a preferential treatment with respect to ++ * others doing random I/O, the consequences are not as ++ * dramatic as with HDDs; ++ * 2) if a process doing random I/O does need strong ++ * throughput guarantees, it is hopefully already being ++ * weight-raised, or the user is likely to have assigned it a ++ * higher weight than the other processes (and thus ++ * sub-condition (i) is likely to be false, which triggers ++ * idling). ++ * ++ * According to the above considerations, the next variable is ++ * true (only) if sub-condition (i) holds. To compute the ++ * value of this variable, we not only use the return value of ++ * the function bfq_symmetric_scenario(), but also check ++ * whether bfqq is being weight-raised, because ++ * bfq_symmetric_scenario() does not take into account also ++ * weight-raised queues (see comments to ++ * bfq_weights_tree_add()). ++ * ++ * As a side note, it is worth considering that the above ++ * device-idling countermeasures may however fail in the ++ * following unlucky scenario: if idling is (correctly) ++ * disabled in a time period during which all symmetry ++ * sub-conditions hold, and hence the device is allowed to ++ * enqueue many requests, but at some later point in time some ++ * sub-condition stops to hold, then it may become impossible ++ * to let requests be served in the desired order until all ++ * the requests already queued in the device have been served. ++ */ ++ asymmetric_scenario = bfqq->wr_coeff > 1 || ++ !bfq_symmetric_scenario(bfqd); ++ ++ /* ++ * Finally, there is a case where maximizing throughput is the ++ * best choice even if it may cause unfairness toward ++ * bfqq. Such a case is when bfqq became active in a burst of ++ * queue activations. Queues that became active during a large ++ * burst benefit only from throughput, as discussed in the ++ * comments to bfq_handle_burst. Thus, if bfqq became active ++ * in a burst and not idling the device maximizes throughput, ++ * then the device must no be idled, because not idling the ++ * device provides bfqq and all other queues in the burst with ++ * maximum benefit. Combining this and the two cases above, we ++ * can now establish when idling is actually needed to ++ * preserve service guarantees. ++ */ ++ idling_needed_for_service_guarantees = ++ (on_hdd_and_not_all_queues_seeky || asymmetric_scenario) && ++ !bfq_bfqq_in_large_burst(bfqq); ++ ++ /* ++ * We have now all the components we need to compute the return ++ * value of the function, which is true only if both the following ++ * conditions hold: ++ * 1) bfqq is sync, because idling make sense only for sync queues; ++ * 2) idling either boosts the throughput (without issues), or ++ * is necessary to preserve service guarantees. ++ */ ++ return bfq_bfqq_sync(bfqq) && ++ (idling_boosts_thr_without_issues || ++ idling_needed_for_service_guarantees); ++} ++ ++/* ++ * If the in-service queue is empty but the function bfq_bfqq_may_idle ++ * returns true, then: ++ * 1) the queue must remain in service and cannot be expired, and ++ * 2) the device must be idled to wait for the possible arrival of a new ++ * request for the queue. ++ * See the comments to the function bfq_bfqq_may_idle for the reasons ++ * why performing device idling is the best choice to boost the throughput ++ * and preserve service guarantees when bfq_bfqq_may_idle itself ++ * returns true. ++ */ ++static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq) ++{ ++ struct bfq_data *bfqd = bfqq->bfqd; ++ ++ return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 && ++ bfq_bfqq_may_idle(bfqq); ++} ++ ++/* ++ * Select a queue for service. If we have a current queue in service, ++ * check whether to continue servicing it, or retrieve and set a new one. ++ */ ++static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) ++{ ++ struct bfq_queue *bfqq; ++ struct request *next_rq; ++ enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT; ++ ++ bfqq = bfqd->in_service_queue; ++ if (!bfqq) ++ goto new_queue; ++ ++ bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue"); ++ ++ if (bfq_may_expire_for_budg_timeout(bfqq) && ++ !timer_pending(&bfqd->idle_slice_timer) && ++ !bfq_bfqq_must_idle(bfqq)) ++ goto expire; ++ ++ next_rq = bfqq->next_rq; ++ /* ++ * If bfqq has requests queued and it has enough budget left to ++ * serve them, keep the queue, otherwise expire it. ++ */ ++ if (next_rq) { ++ if (bfq_serv_to_charge(next_rq, bfqq) > ++ bfq_bfqq_budget_left(bfqq)) { ++ reason = BFQ_BFQQ_BUDGET_EXHAUSTED; ++ goto expire; ++ } else { ++ /* ++ * The idle timer may be pending because we may ++ * not disable disk idling even when a new request ++ * arrives. ++ */ ++ if (timer_pending(&bfqd->idle_slice_timer)) { ++ /* ++ * If we get here: 1) at least a new request ++ * has arrived but we have not disabled the ++ * timer because the request was too small, ++ * 2) then the block layer has unplugged ++ * the device, causing the dispatch to be ++ * invoked. ++ * ++ * Since the device is unplugged, now the ++ * requests are probably large enough to ++ * provide a reasonable throughput. ++ * So we disable idling. ++ */ ++ bfq_clear_bfqq_wait_request(bfqq); ++ del_timer(&bfqd->idle_slice_timer); ++#ifdef CONFIG_BFQ_GROUP_IOSCHED ++ bfqg_stats_update_idle_time(bfqq_group(bfqq)); ++#endif ++ } ++ goto keep_queue; ++ } ++ } ++ ++ /* ++ * No requests pending. However, if the in-service queue is idling ++ * for a new request, or has requests waiting for a completion and ++ * may idle after their completion, then keep it anyway. ++ */ ++ if (timer_pending(&bfqd->idle_slice_timer) || ++ (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) { ++ bfqq = NULL; ++ goto keep_queue; ++ } ++ ++ reason = BFQ_BFQQ_NO_MORE_REQUESTS; ++expire: ++ bfq_bfqq_expire(bfqd, bfqq, false, reason); ++new_queue: ++ bfqq = bfq_set_in_service_queue(bfqd); ++ bfq_log(bfqd, "select_queue: new queue %d returned", ++ bfqq ? bfqq->pid : 0); ++keep_queue: ++ return bfqq; ++} ++ ++static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */ ++ bfq_log_bfqq(bfqd, bfqq, ++ "raising period dur %u/%u msec, old coeff %u, w %d(%d)", ++ jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), ++ jiffies_to_msecs(bfqq->wr_cur_max_time), ++ bfqq->wr_coeff, ++ bfqq->entity.weight, bfqq->entity.orig_weight); ++ ++ BUG_ON(bfqq != bfqd->in_service_queue && entity->weight != ++ entity->orig_weight * bfqq->wr_coeff); ++ if (entity->prio_changed) ++ bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change"); ++ ++ /* ++ * If the queue was activated in a burst, or ++ * too much time has elapsed from the beginning ++ * of this weight-raising period, then end weight ++ * raising. ++ */ ++ if (bfq_bfqq_in_large_burst(bfqq) || ++ time_is_before_jiffies(bfqq->last_wr_start_finish + ++ bfqq->wr_cur_max_time)) { ++ bfqq->last_wr_start_finish = jiffies; ++ bfq_log_bfqq(bfqd, bfqq, ++ "wrais ending at %lu, rais_max_time %u", ++ bfqq->last_wr_start_finish, ++ jiffies_to_msecs(bfqq->wr_cur_max_time)); ++ bfq_bfqq_end_wr(bfqq); ++ } ++ } ++ /* Update weight both if it must be raised and if it must be lowered */ ++ if ((entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1)) ++ __bfq_entity_update_weight_prio( ++ bfq_entity_service_tree(entity), ++ entity); ++} ++ ++/* ++ * Dispatch one request from bfqq, moving it to the request queue ++ * dispatch list. ++ */ ++static int bfq_dispatch_request(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ int dispatched = 0; ++ struct request *rq; ++ unsigned long service_to_charge; ++ ++ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); ++ ++ /* Follow expired path, else get first next available. */ ++ rq = bfq_check_fifo(bfqq); ++ if (!rq) ++ rq = bfqq->next_rq; ++ service_to_charge = bfq_serv_to_charge(rq, bfqq); ++ ++ if (service_to_charge > bfq_bfqq_budget_left(bfqq)) { ++ /* ++ * This may happen if the next rq is chosen in fifo order ++ * instead of sector order. The budget is properly ++ * dimensioned to be always sufficient to serve the next ++ * request only if it is chosen in sector order. The reason ++ * is that it would be quite inefficient and little useful ++ * to always make sure that the budget is large enough to ++ * serve even the possible next rq in fifo order. ++ * In fact, requests are seldom served in fifo order. ++ * ++ * Expire the queue for budget exhaustion, and make sure ++ * that the next act_budget is enough to serve the next ++ * request, even if it comes from the fifo expired path. ++ */ ++ bfqq->next_rq = rq; ++ /* ++ * Since this dispatch is failed, make sure that ++ * a new one will be performed ++ */ ++ if (!bfqd->rq_in_driver) ++ bfq_schedule_dispatch(bfqd); ++ goto expire; ++ } ++ ++ /* Finally, insert request into driver dispatch list. */ ++ bfq_bfqq_served(bfqq, service_to_charge); ++ bfq_dispatch_insert(bfqd->queue, rq); ++ ++ bfq_update_wr_data(bfqd, bfqq); ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "dispatched %u sec req (%llu), budg left %d", ++ blk_rq_sectors(rq), ++ (long long unsigned)blk_rq_pos(rq), ++ bfq_bfqq_budget_left(bfqq)); ++ ++ dispatched++; ++ ++ if (!bfqd->in_service_bic) { ++ atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount); ++ bfqd->in_service_bic = RQ_BIC(rq); ++ } ++ ++ if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) && ++ dispatched >= bfqd->bfq_max_budget_async_rq) || ++ bfq_class_idle(bfqq))) ++ goto expire; ++ ++ return dispatched; ++ ++expire: ++ bfq_bfqq_expire(bfqd, bfqq, false, BFQ_BFQQ_BUDGET_EXHAUSTED); ++ return dispatched; ++} ++ ++static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq) ++{ ++ int dispatched = 0; ++ ++ while (bfqq->next_rq) { ++ bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq); ++ dispatched++; ++ } ++ ++ BUG_ON(!list_empty(&bfqq->fifo)); ++ return dispatched; ++} ++ ++/* ++ * Drain our current requests. ++ * Used for barriers and when switching io schedulers on-the-fly. ++ */ ++static int bfq_forced_dispatch(struct bfq_data *bfqd) ++{ ++ struct bfq_queue *bfqq, *n; ++ struct bfq_service_tree *st; ++ int dispatched = 0; ++ ++ bfqq = bfqd->in_service_queue; ++ if (bfqq) ++ __bfq_bfqq_expire(bfqd, bfqq); ++ ++ /* ++ * Loop through classes, and be careful to leave the scheduler ++ * in a consistent state, as feedback mechanisms and vtime ++ * updates cannot be disabled during the process. ++ */ ++ list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) { ++ st = bfq_entity_service_tree(&bfqq->entity); ++ ++ dispatched += __bfq_forced_dispatch_bfqq(bfqq); ++ bfqq->max_budget = bfq_max_budget(bfqd); ++ ++ bfq_forget_idle(st); ++ } ++ ++ BUG_ON(bfqd->busy_queues != 0); ++ ++ return dispatched; ++} ++ ++static int bfq_dispatch_requests(struct request_queue *q, int force) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ struct bfq_queue *bfqq; ++ int max_dispatch; ++ ++ bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues); ++ if (bfqd->busy_queues == 0) ++ return 0; ++ ++ if (unlikely(force)) ++ return bfq_forced_dispatch(bfqd); ++ ++ bfqq = bfq_select_queue(bfqd); ++ if (!bfqq) ++ return 0; ++ ++ if (bfq_class_idle(bfqq)) ++ max_dispatch = 1; ++ ++ if (!bfq_bfqq_sync(bfqq)) ++ max_dispatch = bfqd->bfq_max_budget_async_rq; ++ ++ if (!bfq_bfqq_sync(bfqq) && bfqq->dispatched >= max_dispatch) { ++ if (bfqd->busy_queues > 1) ++ return 0; ++ if (bfqq->dispatched >= 4 * max_dispatch) ++ return 0; ++ } ++ ++ if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq)) ++ return 0; ++ ++ bfq_clear_bfqq_wait_request(bfqq); ++ BUG_ON(timer_pending(&bfqd->idle_slice_timer)); ++ ++ if (!bfq_dispatch_request(bfqd, bfqq)) ++ return 0; ++ ++ bfq_log_bfqq(bfqd, bfqq, "dispatched %s request", ++ bfq_bfqq_sync(bfqq) ? "sync" : "async"); ++ ++ return 1; ++} ++ ++/* ++ * Task holds one reference to the queue, dropped when task exits. Each rq ++ * in-flight on this queue also holds a reference, dropped when rq is freed. ++ * ++ * Queue lock must be held here. ++ */ ++static void bfq_put_queue(struct bfq_queue *bfqq) ++{ ++ struct bfq_data *bfqd = bfqq->bfqd; ++#ifdef CONFIG_BFQ_GROUP_IOSCHED ++ struct bfq_group *bfqg = bfqq_group(bfqq); ++#endif ++ ++ BUG_ON(atomic_read(&bfqq->ref) <= 0); ++ ++ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq, ++ atomic_read(&bfqq->ref)); ++ if (!atomic_dec_and_test(&bfqq->ref)) ++ return; ++ ++ BUG_ON(rb_first(&bfqq->sort_list)); ++ BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0); ++ BUG_ON(bfqq->entity.tree); ++ BUG_ON(bfq_bfqq_busy(bfqq)); ++ BUG_ON(bfqd->in_service_queue == bfqq); ++ ++ if (bfq_bfqq_sync(bfqq)) ++ /* ++ * The fact that this queue is being destroyed does not ++ * invalidate the fact that this queue may have been ++ * activated during the current burst. As a consequence, ++ * although the queue does not exist anymore, and hence ++ * needs to be removed from the burst list if there, ++ * the burst size has not to be decremented. ++ */ ++ hlist_del_init(&bfqq->burst_list_node); ++ ++ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq); ++ ++ kmem_cache_free(bfq_pool, bfqq); ++#ifdef CONFIG_BFQ_GROUP_IOSCHED ++ bfqg_put(bfqg); ++#endif ++} ++ ++static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ if (bfqq == bfqd->in_service_queue) { ++ __bfq_bfqq_expire(bfqd, bfqq); ++ bfq_schedule_dispatch(bfqd); ++ } ++ ++ bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, ++ atomic_read(&bfqq->ref)); ++ ++ bfq_put_queue(bfqq); ++} ++ ++static void bfq_init_icq(struct io_cq *icq) ++{ ++ struct bfq_io_cq *bic = icq_to_bic(icq); ++ ++ bic->ttime.last_end_request = jiffies; ++} ++ ++static void bfq_exit_icq(struct io_cq *icq) ++{ ++ struct bfq_io_cq *bic = icq_to_bic(icq); ++ struct bfq_data *bfqd = bic_to_bfqd(bic); ++ ++ if (bic->bfqq[BLK_RW_ASYNC]) { ++ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_ASYNC]); ++ bic->bfqq[BLK_RW_ASYNC] = NULL; ++ } ++ ++ if (bic->bfqq[BLK_RW_SYNC]) { ++ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]); ++ bic->bfqq[BLK_RW_SYNC] = NULL; ++ } ++} ++ ++/* ++ * Update the entity prio values; note that the new values will not ++ * be used until the next (re)activation. ++ */ ++static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) ++{ ++ struct task_struct *tsk = current; ++ int ioprio_class; ++ ++ ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); ++ switch (ioprio_class) { ++ default: ++ dev_err(bfqq->bfqd->queue->backing_dev_info.dev, ++ "bfq: bad prio class %d\n", ioprio_class); ++ case IOPRIO_CLASS_NONE: ++ /* ++ * No prio set, inherit CPU scheduling settings. ++ */ ++ bfqq->new_ioprio = task_nice_ioprio(tsk); ++ bfqq->new_ioprio_class = task_nice_ioclass(tsk); ++ break; ++ case IOPRIO_CLASS_RT: ++ bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); ++ bfqq->new_ioprio_class = IOPRIO_CLASS_RT; ++ break; ++ case IOPRIO_CLASS_BE: ++ bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); ++ bfqq->new_ioprio_class = IOPRIO_CLASS_BE; ++ break; ++ case IOPRIO_CLASS_IDLE: ++ bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE; ++ bfqq->new_ioprio = 7; ++ bfq_clear_bfqq_idle_window(bfqq); ++ break; ++ } ++ ++ if (bfqq->new_ioprio < 0 || bfqq->new_ioprio >= IOPRIO_BE_NR) { ++ printk(KERN_CRIT "bfq_set_next_ioprio_data: new_ioprio %d\n", ++ bfqq->new_ioprio); ++ BUG(); ++ } ++ ++ bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio); ++ bfqq->entity.prio_changed = 1; ++} ++ ++static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) ++{ ++ struct bfq_data *bfqd; ++ struct bfq_queue *bfqq, *new_bfqq; ++ unsigned long uninitialized_var(flags); ++ int ioprio = bic->icq.ioc->ioprio; ++ ++ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data), ++ &flags); ++ /* ++ * This condition may trigger on a newly created bic, be sure to ++ * drop the lock before returning. ++ */ ++ if (unlikely(!bfqd) || likely(bic->ioprio == ioprio)) ++ goto out; ++ ++ bic->ioprio = ioprio; ++ ++ bfqq = bic->bfqq[BLK_RW_ASYNC]; ++ if (bfqq) { ++ new_bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic, ++ GFP_ATOMIC); ++ if (new_bfqq) { ++ bic->bfqq[BLK_RW_ASYNC] = new_bfqq; ++ bfq_log_bfqq(bfqd, bfqq, ++ "check_ioprio_change: bfqq %p %d", ++ bfqq, atomic_read(&bfqq->ref)); ++ bfq_put_queue(bfqq); ++ } ++ } ++ ++ bfqq = bic->bfqq[BLK_RW_SYNC]; ++ if (bfqq) ++ bfq_set_next_ioprio_data(bfqq, bic); ++ ++out: ++ bfq_put_bfqd_unlock(bfqd, &flags); ++} ++ ++static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ struct bfq_io_cq *bic, pid_t pid, int is_sync) ++{ ++ RB_CLEAR_NODE(&bfqq->entity.rb_node); ++ INIT_LIST_HEAD(&bfqq->fifo); ++ INIT_HLIST_NODE(&bfqq->burst_list_node); ++ ++ atomic_set(&bfqq->ref, 0); ++ bfqq->bfqd = bfqd; ++ ++ if (bic) ++ bfq_set_next_ioprio_data(bfqq, bic); ++ ++ if (is_sync) { ++ if (!bfq_class_idle(bfqq)) ++ bfq_mark_bfqq_idle_window(bfqq); ++ bfq_mark_bfqq_sync(bfqq); ++ } else ++ bfq_clear_bfqq_sync(bfqq); ++ bfq_mark_bfqq_IO_bound(bfqq); ++ ++ /* Tentative initial value to trade off between thr and lat */ ++ bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3; ++ bfqq->pid = pid; ++ ++ bfqq->wr_coeff = 1; ++ bfqq->last_wr_start_finish = 0; ++ /* ++ * Set to the value for which bfqq will not be deemed as ++ * soft rt when it becomes backlogged. ++ */ ++ bfqq->soft_rt_next_start = bfq_infinity_from_now(jiffies); ++} ++ ++static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd, ++ struct bio *bio, int is_sync, ++ struct bfq_io_cq *bic, ++ gfp_t gfp_mask) ++{ ++ struct bfq_group *bfqg; ++ struct bfq_queue *bfqq, *new_bfqq = NULL; ++ struct blkcg *blkcg; ++ ++retry: ++ rcu_read_lock(); ++ ++ blkcg = bio_blkcg(bio); ++ bfqg = bfq_find_alloc_group(bfqd, blkcg); ++ /* bic always exists here */ ++ bfqq = bic_to_bfqq(bic, is_sync); ++ ++ /* ++ * Always try a new alloc if we fall back to the OOM bfqq ++ * originally, since it should just be a temporary situation. ++ */ ++ if (!bfqq || bfqq == &bfqd->oom_bfqq) { ++ bfqq = NULL; ++ if (new_bfqq) { ++ bfqq = new_bfqq; ++ new_bfqq = NULL; ++ } else if (gfpflags_allow_blocking(gfp_mask)) { ++ rcu_read_unlock(); ++ spin_unlock_irq(bfqd->queue->queue_lock); ++ new_bfqq = kmem_cache_alloc_node(bfq_pool, ++ gfp_mask | __GFP_ZERO, ++ bfqd->queue->node); ++ spin_lock_irq(bfqd->queue->queue_lock); ++ if (new_bfqq) ++ goto retry; ++ } else { ++ bfqq = kmem_cache_alloc_node(bfq_pool, ++ gfp_mask | __GFP_ZERO, ++ bfqd->queue->node); ++ } ++ ++ if (bfqq) { ++ bfq_init_bfqq(bfqd, bfqq, bic, current->pid, ++ is_sync); ++ bfq_init_entity(&bfqq->entity, bfqg); ++ bfq_log_bfqq(bfqd, bfqq, "allocated"); ++ } else { ++ bfqq = &bfqd->oom_bfqq; ++ bfq_log_bfqq(bfqd, bfqq, "using oom bfqq"); ++ } ++ } ++ ++ if (new_bfqq) ++ kmem_cache_free(bfq_pool, new_bfqq); ++ ++ rcu_read_unlock(); ++ ++ return bfqq; ++} ++ ++static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, ++ struct bfq_group *bfqg, ++ int ioprio_class, int ioprio) ++{ ++ switch (ioprio_class) { ++ case IOPRIO_CLASS_RT: ++ return &bfqg->async_bfqq[0][ioprio]; ++ case IOPRIO_CLASS_NONE: ++ ioprio = IOPRIO_NORM; ++ /* fall through */ ++ case IOPRIO_CLASS_BE: ++ return &bfqg->async_bfqq[1][ioprio]; ++ case IOPRIO_CLASS_IDLE: ++ return &bfqg->async_idle_bfqq; ++ default: ++ BUG(); ++ } ++} ++ ++static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, ++ struct bio *bio, int is_sync, ++ struct bfq_io_cq *bic, gfp_t gfp_mask) ++{ ++ const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio); ++ const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); ++ struct bfq_queue **async_bfqq = NULL; ++ struct bfq_queue *bfqq = NULL; ++ ++ if (!is_sync) { ++ struct blkcg *blkcg; ++ struct bfq_group *bfqg; ++ ++ rcu_read_lock(); ++ blkcg = bio_blkcg(bio); ++ rcu_read_unlock(); ++ bfqg = bfq_find_alloc_group(bfqd, blkcg); ++ async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class, ++ ioprio); ++ bfqq = *async_bfqq; ++ } ++ ++ if (!bfqq) ++ bfqq = bfq_find_alloc_queue(bfqd, bio, is_sync, bic, gfp_mask); ++ ++ /* ++ * Pin the queue now that it's allocated, scheduler exit will ++ * prune it. ++ */ ++ if (!is_sync && !(*async_bfqq)) { ++ atomic_inc(&bfqq->ref); ++ bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d", ++ bfqq, atomic_read(&bfqq->ref)); ++ *async_bfqq = bfqq; ++ } ++ ++ atomic_inc(&bfqq->ref); ++ bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, ++ atomic_read(&bfqq->ref)); ++ return bfqq; ++} ++ ++static void bfq_update_io_thinktime(struct bfq_data *bfqd, ++ struct bfq_io_cq *bic) ++{ ++ unsigned long elapsed = jiffies - bic->ttime.last_end_request; ++ unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle); ++ ++ bic->ttime.ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8; ++ bic->ttime.ttime_total = (7*bic->ttime.ttime_total + 256*ttime) / 8; ++ bic->ttime.ttime_mean = (bic->ttime.ttime_total + 128) / ++ bic->ttime.ttime_samples; ++} ++ ++static void bfq_update_io_seektime(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ struct request *rq) ++{ ++ sector_t sdist; ++ u64 total; ++ ++ if (bfqq->last_request_pos < blk_rq_pos(rq)) ++ sdist = blk_rq_pos(rq) - bfqq->last_request_pos; ++ else ++ sdist = bfqq->last_request_pos - blk_rq_pos(rq); ++ ++ /* ++ * Don't allow the seek distance to get too large from the ++ * odd fragment, pagein, etc. ++ */ ++ if (bfqq->seek_samples == 0) /* first request, not really a seek */ ++ sdist = 0; ++ else if (bfqq->seek_samples <= 60) /* second & third seek */ ++ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024); ++ else ++ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64); ++ ++ bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8; ++ bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8; ++ total = bfqq->seek_total + (bfqq->seek_samples/2); ++ do_div(total, bfqq->seek_samples); ++ bfqq->seek_mean = (sector_t)total; ++ ++ bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist, ++ (u64)bfqq->seek_mean); ++} ++ ++/* ++ * Disable idle window if the process thinks too long or seeks so much that ++ * it doesn't matter. ++ */ ++static void bfq_update_idle_window(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ struct bfq_io_cq *bic) ++{ ++ int enable_idle; ++ ++ /* Don't idle for async or idle io prio class. */ ++ if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq)) ++ return; ++ ++ enable_idle = bfq_bfqq_idle_window(bfqq); ++ ++ if (atomic_read(&bic->icq.ioc->active_ref) == 0 || ++ bfqd->bfq_slice_idle == 0 || ++ (bfqd->hw_tag && BFQQ_SEEKY(bfqq) && ++ bfqq->wr_coeff == 1)) ++ enable_idle = 0; ++ else if (bfq_sample_valid(bic->ttime.ttime_samples)) { ++ if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle && ++ bfqq->wr_coeff == 1) ++ enable_idle = 0; ++ else ++ enable_idle = 1; ++ } ++ bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d", ++ enable_idle); ++ ++ if (enable_idle) ++ bfq_mark_bfqq_idle_window(bfqq); ++ else ++ bfq_clear_bfqq_idle_window(bfqq); ++} ++ ++/* ++ * Called when a new fs request (rq) is added to bfqq. Check if there's ++ * something we should do about it. ++ */ ++static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ struct request *rq) ++{ ++ struct bfq_io_cq *bic = RQ_BIC(rq); ++ ++ if (rq->cmd_flags & REQ_META) ++ bfqq->meta_pending++; ++ ++ bfq_update_io_thinktime(bfqd, bic); ++ bfq_update_io_seektime(bfqd, bfqq, rq); ++ if (!BFQQ_SEEKY(bfqq) && bfq_bfqq_constantly_seeky(bfqq)) { ++ bfq_clear_bfqq_constantly_seeky(bfqq); ++ if (!blk_queue_nonrot(bfqd->queue)) { ++ BUG_ON(!bfqd->const_seeky_busy_in_flight_queues); ++ bfqd->const_seeky_busy_in_flight_queues--; ++ } ++ } ++ if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 || ++ !BFQQ_SEEKY(bfqq)) ++ bfq_update_idle_window(bfqd, bfqq, bic); ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "rq_enqueued: idle_window=%d (seeky %d, mean %llu)", ++ bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq), ++ (long long unsigned)bfqq->seek_mean); ++ ++ bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); ++ ++ if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) { ++ bool small_req = bfqq->queued[rq_is_sync(rq)] == 1 && ++ blk_rq_sectors(rq) < 32; ++ bool budget_timeout = bfq_bfqq_budget_timeout(bfqq); ++ ++ /* ++ * There is just this request queued: if the request ++ * is small and the queue is not to be expired, then ++ * just exit. ++ * ++ * In this way, if the disk is being idled to wait for ++ * a new request from the in-service queue, we avoid ++ * unplugging the device and committing the disk to serve ++ * just a small request. On the contrary, we wait for ++ * the block layer to decide when to unplug the device: ++ * hopefully, new requests will be merged to this one ++ * quickly, then the device will be unplugged and ++ * larger requests will be dispatched. ++ */ ++ if (small_req && !budget_timeout) ++ return; ++ ++ /* ++ * A large enough request arrived, or the queue is to ++ * be expired: in both cases disk idling is to be ++ * stopped, so clear wait_request flag and reset ++ * timer. ++ */ ++ bfq_clear_bfqq_wait_request(bfqq); ++ del_timer(&bfqd->idle_slice_timer); ++#ifdef CONFIG_BFQ_GROUP_IOSCHED ++ bfqg_stats_update_idle_time(bfqq_group(bfqq)); ++#endif ++ ++ /* ++ * The queue is not empty, because a new request just ++ * arrived. Hence we can safely expire the queue, in ++ * case of budget timeout, without risking that the ++ * timestamps of the queue are not updated correctly. ++ * See [1] for more details. ++ */ ++ if (budget_timeout) ++ bfq_bfqq_expire(bfqd, bfqq, false, ++ BFQ_BFQQ_BUDGET_TIMEOUT); ++ ++ /* ++ * Let the request rip immediately, or let a new queue be ++ * selected if bfqq has just been expired. ++ */ ++ __blk_run_queue(bfqd->queue); ++ } ++} ++ ++static void bfq_insert_request(struct request_queue *q, struct request *rq) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ struct bfq_queue *bfqq = RQ_BFQQ(rq); ++ ++ assert_spin_locked(bfqd->queue->queue_lock); ++ ++ bfq_add_request(rq); ++ ++ rq->fifo_time = jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; ++ list_add_tail(&rq->queuelist, &bfqq->fifo); ++ ++ bfq_rq_enqueued(bfqd, bfqq, rq); ++} ++ ++static void bfq_update_hw_tag(struct bfq_data *bfqd) ++{ ++ bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver, ++ bfqd->rq_in_driver); ++ ++ if (bfqd->hw_tag == 1) ++ return; ++ ++ /* ++ * This sample is valid if the number of outstanding requests ++ * is large enough to allow a queueing behavior. Note that the ++ * sum is not exact, as it's not taking into account deactivated ++ * requests. ++ */ ++ if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD) ++ return; ++ ++ if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES) ++ return; ++ ++ bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD; ++ bfqd->max_rq_in_driver = 0; ++ bfqd->hw_tag_samples = 0; ++} ++ ++static void bfq_completed_request(struct request_queue *q, struct request *rq) ++{ ++ struct bfq_queue *bfqq = RQ_BFQQ(rq); ++ struct bfq_data *bfqd = bfqq->bfqd; ++ bool sync = bfq_bfqq_sync(bfqq); ++ ++ bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left (%d)", ++ blk_rq_sectors(rq), sync); ++ ++ bfq_update_hw_tag(bfqd); ++ ++ BUG_ON(!bfqd->rq_in_driver); ++ BUG_ON(!bfqq->dispatched); ++ bfqd->rq_in_driver--; ++ bfqq->dispatched--; ++#ifdef CONFIG_BFQ_GROUP_IOSCHED ++ bfqg_stats_update_completion(bfqq_group(bfqq), ++ rq_start_time_ns(rq), ++ rq_io_start_time_ns(rq), rq->cmd_flags); ++#endif ++ ++ if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) { ++ bfq_weights_tree_remove(bfqd, &bfqq->entity, ++ &bfqd->queue_weights_tree); ++ if (!blk_queue_nonrot(bfqd->queue)) { ++ BUG_ON(!bfqd->busy_in_flight_queues); ++ bfqd->busy_in_flight_queues--; ++ if (bfq_bfqq_constantly_seeky(bfqq)) { ++ BUG_ON(!bfqd-> ++ const_seeky_busy_in_flight_queues); ++ bfqd->const_seeky_busy_in_flight_queues--; ++ } ++ } ++ } ++ ++ if (sync) { ++ bfqd->sync_flight--; ++ RQ_BIC(rq)->ttime.last_end_request = jiffies; ++ } ++ ++ /* ++ * If we are waiting to discover whether the request pattern of the ++ * task associated with the queue is actually isochronous, and ++ * both requisites for this condition to hold are satisfied, then ++ * compute soft_rt_next_start (see the comments to the function ++ * bfq_bfqq_softrt_next_start()). ++ */ ++ if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 && ++ RB_EMPTY_ROOT(&bfqq->sort_list)) ++ bfqq->soft_rt_next_start = ++ bfq_bfqq_softrt_next_start(bfqd, bfqq); ++ ++ /* ++ * If this is the in-service queue, check if it needs to be expired, ++ * or if we want to idle in case it has no pending requests. ++ */ ++ if (bfqd->in_service_queue == bfqq) { ++ if (bfq_bfqq_budget_new(bfqq)) ++ bfq_set_budget_timeout(bfqd); ++ ++ if (bfq_bfqq_must_idle(bfqq)) { ++ bfq_arm_slice_timer(bfqd); ++ goto out; ++ } else if (bfq_may_expire_for_budg_timeout(bfqq)) ++ bfq_bfqq_expire(bfqd, bfqq, false, ++ BFQ_BFQQ_BUDGET_TIMEOUT); ++ else if (RB_EMPTY_ROOT(&bfqq->sort_list) && ++ (bfqq->dispatched == 0 || ++ !bfq_bfqq_may_idle(bfqq))) ++ bfq_bfqq_expire(bfqd, bfqq, false, ++ BFQ_BFQQ_NO_MORE_REQUESTS); ++ } ++ ++ if (!bfqd->rq_in_driver) ++ bfq_schedule_dispatch(bfqd); ++ ++out: ++ return; ++} ++ ++static int __bfq_may_queue(struct bfq_queue *bfqq) ++{ ++ if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) { ++ bfq_clear_bfqq_must_alloc(bfqq); ++ return ELV_MQUEUE_MUST; ++ } ++ ++ return ELV_MQUEUE_MAY; ++} ++ ++static int bfq_may_queue(struct request_queue *q, int rw) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ struct task_struct *tsk = current; ++ struct bfq_io_cq *bic; ++ struct bfq_queue *bfqq; ++ ++ /* ++ * Don't force setup of a queue from here, as a call to may_queue ++ * does not necessarily imply that a request actually will be ++ * queued. So just lookup a possibly existing queue, or return ++ * 'may queue' if that fails. ++ */ ++ bic = bfq_bic_lookup(bfqd, tsk->io_context); ++ if (!bic) ++ return ELV_MQUEUE_MAY; ++ ++ bfqq = bic_to_bfqq(bic, rw_is_sync(rw)); ++ if (bfqq) ++ return __bfq_may_queue(bfqq); ++ ++ return ELV_MQUEUE_MAY; ++} ++ ++/* ++ * Queue lock held here. ++ */ ++static void bfq_put_request(struct request *rq) ++{ ++ struct bfq_queue *bfqq = RQ_BFQQ(rq); ++ ++ if (bfqq) { ++ const int rw = rq_data_dir(rq); ++ ++ BUG_ON(!bfqq->allocated[rw]); ++ bfqq->allocated[rw]--; ++ ++ rq->elv.priv[0] = NULL; ++ rq->elv.priv[1] = NULL; ++ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d", ++ bfqq, atomic_read(&bfqq->ref)); ++ bfq_put_queue(bfqq); ++ } ++} ++ ++/* ++ * Allocate bfq data structures associated with this request. ++ */ ++static int bfq_set_request(struct request_queue *q, struct request *rq, ++ struct bio *bio, gfp_t gfp_mask) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq); ++ const int rw = rq_data_dir(rq); ++ const int is_sync = rq_is_sync(rq); ++ struct bfq_queue *bfqq; ++ unsigned long flags; ++ ++ might_sleep_if(gfpflags_allow_blocking(gfp_mask)); ++ ++ bfq_check_ioprio_change(bic, bio); ++ ++ spin_lock_irqsave(q->queue_lock, flags); ++ ++ if (!bic) ++ goto queue_fail; ++ ++ bfq_bic_update_cgroup(bic, bio); ++ ++ bfqq = bic_to_bfqq(bic, is_sync); ++ if (!bfqq || bfqq == &bfqd->oom_bfqq) { ++ bfqq = bfq_get_queue(bfqd, bio, is_sync, bic, gfp_mask); ++ bic_set_bfqq(bic, bfqq, is_sync); ++ if (is_sync) { ++ if (bfqd->large_burst) ++ bfq_mark_bfqq_in_large_burst(bfqq); ++ else ++ bfq_clear_bfqq_in_large_burst(bfqq); ++ } ++ } ++ ++ bfqq->allocated[rw]++; ++ atomic_inc(&bfqq->ref); ++ bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, ++ atomic_read(&bfqq->ref)); ++ ++ rq->elv.priv[0] = bic; ++ rq->elv.priv[1] = bfqq; ++ ++ spin_unlock_irqrestore(q->queue_lock, flags); ++ ++ return 0; ++ ++queue_fail: ++ bfq_schedule_dispatch(bfqd); ++ spin_unlock_irqrestore(q->queue_lock, flags); ++ ++ return 1; ++} ++ ++static void bfq_kick_queue(struct work_struct *work) ++{ ++ struct bfq_data *bfqd = ++ container_of(work, struct bfq_data, unplug_work); ++ struct request_queue *q = bfqd->queue; ++ ++ spin_lock_irq(q->queue_lock); ++ __blk_run_queue(q); ++ spin_unlock_irq(q->queue_lock); ++} ++ ++/* ++ * Handler of the expiration of the timer running if the in-service queue ++ * is idling inside its time slice. ++ */ ++static void bfq_idle_slice_timer(unsigned long data) ++{ ++ struct bfq_data *bfqd = (struct bfq_data *)data; ++ struct bfq_queue *bfqq; ++ unsigned long flags; ++ enum bfqq_expiration reason; ++ ++ spin_lock_irqsave(bfqd->queue->queue_lock, flags); ++ ++ bfqq = bfqd->in_service_queue; ++ /* ++ * Theoretical race here: the in-service queue can be NULL or ++ * different from the queue that was idling if the timer handler ++ * spins on the queue_lock and a new request arrives for the ++ * current queue and there is a full dispatch cycle that changes ++ * the in-service queue. This can hardly happen, but in the worst ++ * case we just expire a queue too early. ++ */ ++ if (bfqq) { ++ bfq_log_bfqq(bfqd, bfqq, "slice_timer expired"); ++ if (bfq_bfqq_budget_timeout(bfqq)) ++ /* ++ * Also here the queue can be safely expired ++ * for budget timeout without wasting ++ * guarantees ++ */ ++ reason = BFQ_BFQQ_BUDGET_TIMEOUT; ++ else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0) ++ /* ++ * The queue may not be empty upon timer expiration, ++ * because we may not disable the timer when the ++ * first request of the in-service queue arrives ++ * during disk idling. ++ */ ++ reason = BFQ_BFQQ_TOO_IDLE; ++ else ++ goto schedule_dispatch; ++ ++ bfq_bfqq_expire(bfqd, bfqq, true, reason); ++ } ++ ++schedule_dispatch: ++ bfq_schedule_dispatch(bfqd); ++ ++ spin_unlock_irqrestore(bfqd->queue->queue_lock, flags); ++} ++ ++static void bfq_shutdown_timer_wq(struct bfq_data *bfqd) ++{ ++ del_timer_sync(&bfqd->idle_slice_timer); ++ cancel_work_sync(&bfqd->unplug_work); ++} ++ ++static void __bfq_put_async_bfqq(struct bfq_data *bfqd, ++ struct bfq_queue **bfqq_ptr) ++{ ++ struct bfq_group *root_group = bfqd->root_group; ++ struct bfq_queue *bfqq = *bfqq_ptr; ++ ++ bfq_log(bfqd, "put_async_bfqq: %p", bfqq); ++ if (bfqq) { ++ bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group); ++ bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d", ++ bfqq, atomic_read(&bfqq->ref)); ++ bfq_put_queue(bfqq); ++ *bfqq_ptr = NULL; ++ } ++} ++ ++/* ++ * Release all the bfqg references to its async queues. If we are ++ * deallocating the group these queues may still contain requests, so ++ * we reparent them to the root cgroup (i.e., the only one that will ++ * exist for sure until all the requests on a device are gone). ++ */ ++static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) ++{ ++ int i, j; ++ ++ for (i = 0; i < 2; i++) ++ for (j = 0; j < IOPRIO_BE_NR; j++) ++ __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]); ++ ++ __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq); ++} ++ ++static void bfq_exit_queue(struct elevator_queue *e) ++{ ++ struct bfq_data *bfqd = e->elevator_data; ++ struct request_queue *q = bfqd->queue; ++ struct bfq_queue *bfqq, *n; ++ ++ bfq_shutdown_timer_wq(bfqd); ++ ++ spin_lock_irq(q->queue_lock); ++ ++ BUG_ON(bfqd->in_service_queue); ++ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) ++ bfq_deactivate_bfqq(bfqd, bfqq, 0); ++ ++ spin_unlock_irq(q->queue_lock); ++ ++ bfq_shutdown_timer_wq(bfqd); ++ ++ synchronize_rcu(); ++ ++ BUG_ON(timer_pending(&bfqd->idle_slice_timer)); ++ ++#ifdef CONFIG_BFQ_GROUP_IOSCHED ++ blkcg_deactivate_policy(q, &blkcg_policy_bfq); ++#else ++ kfree(bfqd->root_group); ++#endif ++ ++ kfree(bfqd); ++} ++ ++static void bfq_init_root_group(struct bfq_group *root_group, ++ struct bfq_data *bfqd) ++{ ++ int i; ++ ++#ifdef CONFIG_BFQ_GROUP_IOSCHED ++ root_group->entity.parent = NULL; ++ root_group->my_entity = NULL; ++ root_group->bfqd = bfqd; ++#endif ++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) ++ root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; ++} ++ ++static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) ++{ ++ struct bfq_data *bfqd; ++ struct elevator_queue *eq; ++ ++ eq = elevator_alloc(q, e); ++ if (!eq) ++ return -ENOMEM; ++ ++ bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node); ++ if (!bfqd) { ++ kobject_put(&eq->kobj); ++ return -ENOMEM; ++ } ++ eq->elevator_data = bfqd; ++ ++ /* ++ * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. ++ * Grab a permanent reference to it, so that the normal code flow ++ * will not attempt to free it. ++ */ ++ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0); ++ atomic_inc(&bfqd->oom_bfqq.ref); ++ bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO; ++ bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE; ++ bfqd->oom_bfqq.entity.new_weight = ++ bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio); ++ /* ++ * Trigger weight initialization, according to ioprio, at the ++ * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio ++ * class won't be changed any more. ++ */ ++ bfqd->oom_bfqq.entity.prio_changed = 1; ++ ++ bfqd->queue = q; ++ ++ spin_lock_irq(q->queue_lock); ++ q->elevator = eq; ++ spin_unlock_irq(q->queue_lock); ++ ++ bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node); ++ if (!bfqd->root_group) ++ goto out_free; ++ bfq_init_root_group(bfqd->root_group, bfqd); ++ bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); ++#ifdef CONFIG_BFQ_GROUP_IOSCHED ++ bfqd->active_numerous_groups = 0; ++#endif ++ ++ init_timer(&bfqd->idle_slice_timer); ++ bfqd->idle_slice_timer.function = bfq_idle_slice_timer; ++ bfqd->idle_slice_timer.data = (unsigned long)bfqd; ++ ++ bfqd->queue_weights_tree = RB_ROOT; ++ bfqd->group_weights_tree = RB_ROOT; ++ ++ INIT_WORK(&bfqd->unplug_work, bfq_kick_queue); ++ ++ INIT_LIST_HEAD(&bfqd->active_list); ++ INIT_LIST_HEAD(&bfqd->idle_list); ++ INIT_HLIST_HEAD(&bfqd->burst_list); ++ ++ bfqd->hw_tag = -1; ++ ++ bfqd->bfq_max_budget = bfq_default_max_budget; ++ ++ bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0]; ++ bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1]; ++ bfqd->bfq_back_max = bfq_back_max; ++ bfqd->bfq_back_penalty = bfq_back_penalty; ++ bfqd->bfq_slice_idle = bfq_slice_idle; ++ bfqd->bfq_class_idle_last_service = 0; ++ bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq; ++ bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async; ++ bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync; ++ ++ bfqd->bfq_requests_within_timer = 120; ++ ++ bfqd->bfq_large_burst_thresh = 11; ++ bfqd->bfq_burst_interval = msecs_to_jiffies(500); ++ ++ bfqd->low_latency = true; ++ ++ bfqd->bfq_wr_coeff = 20; ++ bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300); ++ bfqd->bfq_wr_max_time = 0; ++ bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000); ++ bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500); ++ bfqd->bfq_wr_max_softrt_rate = 7000; /* ++ * Approximate rate required ++ * to playback or record a ++ * high-definition compressed ++ * video. ++ */ ++ bfqd->wr_busy_queues = 0; ++ bfqd->busy_in_flight_queues = 0; ++ bfqd->const_seeky_busy_in_flight_queues = 0; ++ ++ /* ++ * Begin by assuming, optimistically, that the device peak rate is ++ * equal to the highest reference rate. ++ */ ++ bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] * ++ T_fast[blk_queue_nonrot(bfqd->queue)]; ++ bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)]; ++ bfqd->device_speed = BFQ_BFQD_FAST; ++ ++ return 0; ++ ++out_free: ++ kfree(bfqd); ++ kobject_put(&eq->kobj); ++ return -ENOMEM; ++} ++ ++static void bfq_slab_kill(void) ++{ ++ if (bfq_pool) ++ kmem_cache_destroy(bfq_pool); ++} ++ ++static int __init bfq_slab_setup(void) ++{ ++ bfq_pool = KMEM_CACHE(bfq_queue, 0); ++ if (!bfq_pool) ++ return -ENOMEM; ++ return 0; ++} ++ ++static ssize_t bfq_var_show(unsigned int var, char *page) ++{ ++ return sprintf(page, "%d\n", var); ++} ++ ++static ssize_t bfq_var_store(unsigned long *var, const char *page, ++ size_t count) ++{ ++ unsigned long new_val; ++ int ret = kstrtoul(page, 10, &new_val); ++ ++ if (ret == 0) ++ *var = new_val; ++ ++ return count; ++} ++ ++static ssize_t bfq_wr_max_time_show(struct elevator_queue *e, char *page) ++{ ++ struct bfq_data *bfqd = e->elevator_data; ++ return sprintf(page, "%d\n", bfqd->bfq_wr_max_time > 0 ? ++ jiffies_to_msecs(bfqd->bfq_wr_max_time) : ++ jiffies_to_msecs(bfq_wr_duration(bfqd))); ++} ++ ++static ssize_t bfq_weights_show(struct elevator_queue *e, char *page) ++{ ++ struct bfq_queue *bfqq; ++ struct bfq_data *bfqd = e->elevator_data; ++ ssize_t num_char = 0; ++ ++ num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n", ++ bfqd->queued); ++ ++ spin_lock_irq(bfqd->queue->queue_lock); ++ ++ num_char += sprintf(page + num_char, "Active:\n"); ++ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) { ++ num_char += sprintf(page + num_char, ++ "pid%d: weight %hu, nr_queued %d %d, dur %d/%u\n", ++ bfqq->pid, ++ bfqq->entity.weight, ++ bfqq->queued[0], ++ bfqq->queued[1], ++ jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), ++ jiffies_to_msecs(bfqq->wr_cur_max_time)); ++ } ++ ++ num_char += sprintf(page + num_char, "Idle:\n"); ++ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) { ++ num_char += sprintf(page + num_char, ++ "pid%d: weight %hu, dur %d/%u\n", ++ bfqq->pid, ++ bfqq->entity.weight, ++ jiffies_to_msecs(jiffies - ++ bfqq->last_wr_start_finish), ++ jiffies_to_msecs(bfqq->wr_cur_max_time)); ++ } ++ ++ spin_unlock_irq(bfqd->queue->queue_lock); ++ ++ return num_char; ++} ++ ++#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ ++static ssize_t __FUNC(struct elevator_queue *e, char *page) \ ++{ \ ++ struct bfq_data *bfqd = e->elevator_data; \ ++ unsigned int __data = __VAR; \ ++ if (__CONV) \ ++ __data = jiffies_to_msecs(__data); \ ++ return bfq_var_show(__data, (page)); \ ++} ++SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1); ++SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1); ++SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0); ++SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0); ++SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1); ++SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0); ++SHOW_FUNCTION(bfq_max_budget_async_rq_show, ++ bfqd->bfq_max_budget_async_rq, 0); ++SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1); ++SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1); ++SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0); ++SHOW_FUNCTION(bfq_wr_coeff_show, bfqd->bfq_wr_coeff, 0); ++SHOW_FUNCTION(bfq_wr_rt_max_time_show, bfqd->bfq_wr_rt_max_time, 1); ++SHOW_FUNCTION(bfq_wr_min_idle_time_show, bfqd->bfq_wr_min_idle_time, 1); ++SHOW_FUNCTION(bfq_wr_min_inter_arr_async_show, bfqd->bfq_wr_min_inter_arr_async, ++ 1); ++SHOW_FUNCTION(bfq_wr_max_softrt_rate_show, bfqd->bfq_wr_max_softrt_rate, 0); ++#undef SHOW_FUNCTION ++ ++#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ ++static ssize_t \ ++__FUNC(struct elevator_queue *e, const char *page, size_t count) \ ++{ \ ++ struct bfq_data *bfqd = e->elevator_data; \ ++ unsigned long uninitialized_var(__data); \ ++ int ret = bfq_var_store(&__data, (page), count); \ ++ if (__data < (MIN)) \ ++ __data = (MIN); \ ++ else if (__data > (MAX)) \ ++ __data = (MAX); \ ++ if (__CONV) \ ++ *(__PTR) = msecs_to_jiffies(__data); \ ++ else \ ++ *(__PTR) = __data; \ ++ return ret; \ ++} ++STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1, ++ INT_MAX, 1); ++STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1, ++ INT_MAX, 1); ++STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0); ++STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1, ++ INT_MAX, 0); ++STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1); ++STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq, ++ 1, INT_MAX, 0); ++STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0, ++ INT_MAX, 1); ++STORE_FUNCTION(bfq_wr_coeff_store, &bfqd->bfq_wr_coeff, 1, INT_MAX, 0); ++STORE_FUNCTION(bfq_wr_max_time_store, &bfqd->bfq_wr_max_time, 0, INT_MAX, 1); ++STORE_FUNCTION(bfq_wr_rt_max_time_store, &bfqd->bfq_wr_rt_max_time, 0, INT_MAX, ++ 1); ++STORE_FUNCTION(bfq_wr_min_idle_time_store, &bfqd->bfq_wr_min_idle_time, 0, ++ INT_MAX, 1); ++STORE_FUNCTION(bfq_wr_min_inter_arr_async_store, ++ &bfqd->bfq_wr_min_inter_arr_async, 0, INT_MAX, 1); ++STORE_FUNCTION(bfq_wr_max_softrt_rate_store, &bfqd->bfq_wr_max_softrt_rate, 0, ++ INT_MAX, 0); ++#undef STORE_FUNCTION ++ ++/* do nothing for the moment */ ++static ssize_t bfq_weights_store(struct elevator_queue *e, ++ const char *page, size_t count) ++{ ++ return count; ++} ++ ++static unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd) ++{ ++ u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]); ++ ++ if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES) ++ return bfq_calc_max_budget(bfqd->peak_rate, timeout); ++ else ++ return bfq_default_max_budget; ++} ++ ++static ssize_t bfq_max_budget_store(struct elevator_queue *e, ++ const char *page, size_t count) ++{ ++ struct bfq_data *bfqd = e->elevator_data; ++ unsigned long uninitialized_var(__data); ++ int ret = bfq_var_store(&__data, (page), count); ++ ++ if (__data == 0) ++ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd); ++ else { ++ if (__data > INT_MAX) ++ __data = INT_MAX; ++ bfqd->bfq_max_budget = __data; ++ } ++ ++ bfqd->bfq_user_max_budget = __data; ++ ++ return ret; ++} ++ ++static ssize_t bfq_timeout_sync_store(struct elevator_queue *e, ++ const char *page, size_t count) ++{ ++ struct bfq_data *bfqd = e->elevator_data; ++ unsigned long uninitialized_var(__data); ++ int ret = bfq_var_store(&__data, (page), count); ++ ++ if (__data < 1) ++ __data = 1; ++ else if (__data > INT_MAX) ++ __data = INT_MAX; ++ ++ bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data); ++ if (bfqd->bfq_user_max_budget == 0) ++ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd); ++ ++ return ret; ++} ++ ++static ssize_t bfq_low_latency_store(struct elevator_queue *e, ++ const char *page, size_t count) ++{ ++ struct bfq_data *bfqd = e->elevator_data; ++ unsigned long uninitialized_var(__data); ++ int ret = bfq_var_store(&__data, (page), count); ++ ++ if (__data > 1) ++ __data = 1; ++ if (__data == 0 && bfqd->low_latency != 0) ++ bfq_end_wr(bfqd); ++ bfqd->low_latency = __data; ++ ++ return ret; ++} ++ ++#define BFQ_ATTR(name) \ ++ __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store) ++ ++static struct elv_fs_entry bfq_attrs[] = { ++ BFQ_ATTR(fifo_expire_sync), ++ BFQ_ATTR(fifo_expire_async), ++ BFQ_ATTR(back_seek_max), ++ BFQ_ATTR(back_seek_penalty), ++ BFQ_ATTR(slice_idle), ++ BFQ_ATTR(max_budget), ++ BFQ_ATTR(max_budget_async_rq), ++ BFQ_ATTR(timeout_sync), ++ BFQ_ATTR(timeout_async), ++ BFQ_ATTR(low_latency), ++ BFQ_ATTR(wr_coeff), ++ BFQ_ATTR(wr_max_time), ++ BFQ_ATTR(wr_rt_max_time), ++ BFQ_ATTR(wr_min_idle_time), ++ BFQ_ATTR(wr_min_inter_arr_async), ++ BFQ_ATTR(wr_max_softrt_rate), ++ BFQ_ATTR(weights), ++ __ATTR_NULL ++}; ++ ++static struct elevator_type iosched_bfq = { ++ .ops = { ++ .elevator_merge_fn = bfq_merge, ++ .elevator_merged_fn = bfq_merged_request, ++ .elevator_merge_req_fn = bfq_merged_requests, ++#ifdef CONFIG_BFQ_GROUP_IOSCHED ++ .elevator_bio_merged_fn = bfq_bio_merged, ++#endif ++ .elevator_allow_merge_fn = bfq_allow_merge, ++ .elevator_dispatch_fn = bfq_dispatch_requests, ++ .elevator_add_req_fn = bfq_insert_request, ++ .elevator_activate_req_fn = bfq_activate_request, ++ .elevator_deactivate_req_fn = bfq_deactivate_request, ++ .elevator_completed_req_fn = bfq_completed_request, ++ .elevator_former_req_fn = elv_rb_former_request, ++ .elevator_latter_req_fn = elv_rb_latter_request, ++ .elevator_init_icq_fn = bfq_init_icq, ++ .elevator_exit_icq_fn = bfq_exit_icq, ++ .elevator_set_req_fn = bfq_set_request, ++ .elevator_put_req_fn = bfq_put_request, ++ .elevator_may_queue_fn = bfq_may_queue, ++ .elevator_init_fn = bfq_init_queue, ++ .elevator_exit_fn = bfq_exit_queue, ++ }, ++ .icq_size = sizeof(struct bfq_io_cq), ++ .icq_align = __alignof__(struct bfq_io_cq), ++ .elevator_attrs = bfq_attrs, ++ .elevator_name = "bfq", ++ .elevator_owner = THIS_MODULE, ++}; ++ ++static int __init bfq_init(void) ++{ ++ int ret; ++ ++ /* ++ * Can be 0 on HZ < 1000 setups. ++ */ ++ if (bfq_slice_idle == 0) ++ bfq_slice_idle = 1; ++ ++ if (bfq_timeout_async == 0) ++ bfq_timeout_async = 1; ++ ++#ifdef CONFIG_BFQ_GROUP_IOSCHED ++ ret = blkcg_policy_register(&blkcg_policy_bfq); ++ if (ret) ++ return ret; ++#endif ++ ++ ret = -ENOMEM; ++ if (bfq_slab_setup()) ++ goto err_pol_unreg; ++ ++ /* ++ * Times to load large popular applications for the typical systems ++ * installed on the reference devices (see the comments before the ++ * definitions of the two arrays). ++ */ ++ T_slow[0] = msecs_to_jiffies(2600); ++ T_slow[1] = msecs_to_jiffies(1000); ++ T_fast[0] = msecs_to_jiffies(5500); ++ T_fast[1] = msecs_to_jiffies(2000); ++ ++ /* ++ * Thresholds that determine the switch between speed classes (see ++ * the comments before the definition of the array). ++ */ ++ device_speed_thresh[0] = (R_fast[0] + R_slow[0]) / 2; ++ device_speed_thresh[1] = (R_fast[1] + R_slow[1]) / 2; ++ ++ ret = elv_register(&iosched_bfq); ++ if (ret) ++ goto err_pol_unreg; ++ ++ pr_info("BFQ I/O-scheduler: v7r11"); ++ ++ return 0; ++ ++err_pol_unreg: ++#ifdef CONFIG_BFQ_GROUP_IOSCHED ++ blkcg_policy_unregister(&blkcg_policy_bfq); ++#endif ++ return ret; ++} ++ ++static void __exit bfq_exit(void) ++{ ++ elv_unregister(&iosched_bfq); ++#ifdef CONFIG_BFQ_GROUP_IOSCHED ++ blkcg_policy_unregister(&blkcg_policy_bfq); ++#endif ++ bfq_slab_kill(); ++} ++ ++module_init(bfq_init); ++module_exit(bfq_exit); ++ ++MODULE_AUTHOR("Arianna Avanzini, Fabio Checconi, Paolo Valente"); ++MODULE_LICENSE("GPL"); +diff --git a/block/bfq-sched.c b/block/bfq-sched.c +new file mode 100644 +index 0000000..a64fec1 +--- /dev/null ++++ b/block/bfq-sched.c +@@ -0,0 +1,1200 @@ ++/* ++ * BFQ: Hierarchical B-WF2Q+ scheduler. ++ * ++ * Based on ideas and code from CFQ: ++ * Copyright (C) 2003 Jens Axboe ++ * ++ * Copyright (C) 2008 Fabio Checconi ++ * Paolo Valente ++ * ++ * Copyright (C) 2010 Paolo Valente ++ */ ++ ++#ifdef CONFIG_BFQ_GROUP_IOSCHED ++#define for_each_entity(entity) \ ++ for (; entity ; entity = entity->parent) ++ ++#define for_each_entity_safe(entity, parent) \ ++ for (; entity && ({ parent = entity->parent; 1; }); entity = parent) ++ ++ ++static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, ++ int extract, ++ struct bfq_data *bfqd); ++ ++static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); ++ ++static void bfq_update_budget(struct bfq_entity *next_in_service) ++{ ++ struct bfq_entity *bfqg_entity; ++ struct bfq_group *bfqg; ++ struct bfq_sched_data *group_sd; ++ ++ BUG_ON(!next_in_service); ++ ++ group_sd = next_in_service->sched_data; ++ ++ bfqg = container_of(group_sd, struct bfq_group, sched_data); ++ /* ++ * bfq_group's my_entity field is not NULL only if the group ++ * is not the root group. We must not touch the root entity ++ * as it must never become an in-service entity. ++ */ ++ bfqg_entity = bfqg->my_entity; ++ if (bfqg_entity) ++ bfqg_entity->budget = next_in_service->budget; ++} ++ ++static int bfq_update_next_in_service(struct bfq_sched_data *sd) ++{ ++ struct bfq_entity *next_in_service; ++ ++ if (sd->in_service_entity) ++ /* will update/requeue at the end of service */ ++ return 0; ++ ++ /* ++ * NOTE: this can be improved in many ways, such as returning ++ * 1 (and thus propagating upwards the update) only when the ++ * budget changes, or caching the bfqq that will be scheduled ++ * next from this subtree. By now we worry more about ++ * correctness than about performance... ++ */ ++ next_in_service = bfq_lookup_next_entity(sd, 0, NULL); ++ sd->next_in_service = next_in_service; ++ ++ if (next_in_service) ++ bfq_update_budget(next_in_service); ++ ++ return 1; ++} ++ ++static void bfq_check_next_in_service(struct bfq_sched_data *sd, ++ struct bfq_entity *entity) ++{ ++ BUG_ON(sd->next_in_service != entity); ++} ++#else ++#define for_each_entity(entity) \ ++ for (; entity ; entity = NULL) ++ ++#define for_each_entity_safe(entity, parent) \ ++ for (parent = NULL; entity ; entity = parent) ++ ++static int bfq_update_next_in_service(struct bfq_sched_data *sd) ++{ ++ return 0; ++} ++ ++static void bfq_check_next_in_service(struct bfq_sched_data *sd, ++ struct bfq_entity *entity) ++{ ++} ++ ++static void bfq_update_budget(struct bfq_entity *next_in_service) ++{ ++} ++#endif ++ ++/* ++ * Shift for timestamp calculations. This actually limits the maximum ++ * service allowed in one timestamp delta (small shift values increase it), ++ * the maximum total weight that can be used for the queues in the system ++ * (big shift values increase it), and the period of virtual time ++ * wraparounds. ++ */ ++#define WFQ_SERVICE_SHIFT 22 ++ ++/** ++ * bfq_gt - compare two timestamps. ++ * @a: first ts. ++ * @b: second ts. ++ * ++ * Return @a > @b, dealing with wrapping correctly. ++ */ ++static int bfq_gt(u64 a, u64 b) ++{ ++ return (s64)(a - b) > 0; ++} ++ ++static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = NULL; ++ ++ BUG_ON(!entity); ++ ++ if (!entity->my_sched_data) ++ bfqq = container_of(entity, struct bfq_queue, entity); ++ ++ return bfqq; ++} ++ ++ ++/** ++ * bfq_delta - map service into the virtual time domain. ++ * @service: amount of service. ++ * @weight: scale factor (weight of an entity or weight sum). ++ */ ++static u64 bfq_delta(unsigned long service, unsigned long weight) ++{ ++ u64 d = (u64)service << WFQ_SERVICE_SHIFT; ++ ++ do_div(d, weight); ++ return d; ++} ++ ++/** ++ * bfq_calc_finish - assign the finish time to an entity. ++ * @entity: the entity to act upon. ++ * @service: the service to be charged to the entity. ++ */ ++static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ ++ BUG_ON(entity->weight == 0); ++ ++ entity->finish = entity->start + ++ bfq_delta(service, entity->weight); ++ ++ if (bfqq) { ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "calc_finish: serv %lu, w %d", ++ service, entity->weight); ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "calc_finish: start %llu, finish %llu, delta %llu", ++ entity->start, entity->finish, ++ bfq_delta(service, entity->weight)); ++ } ++} ++ ++/** ++ * bfq_entity_of - get an entity from a node. ++ * @node: the node field of the entity. ++ * ++ * Convert a node pointer to the relative entity. This is used only ++ * to simplify the logic of some functions and not as the generic ++ * conversion mechanism because, e.g., in the tree walking functions, ++ * the check for a %NULL value would be redundant. ++ */ ++static struct bfq_entity *bfq_entity_of(struct rb_node *node) ++{ ++ struct bfq_entity *entity = NULL; ++ ++ if (node) ++ entity = rb_entry(node, struct bfq_entity, rb_node); ++ ++ return entity; ++} ++ ++/** ++ * bfq_extract - remove an entity from a tree. ++ * @root: the tree root. ++ * @entity: the entity to remove. ++ */ ++static void bfq_extract(struct rb_root *root, struct bfq_entity *entity) ++{ ++ BUG_ON(entity->tree != root); ++ ++ entity->tree = NULL; ++ rb_erase(&entity->rb_node, root); ++} ++ ++/** ++ * bfq_idle_extract - extract an entity from the idle tree. ++ * @st: the service tree of the owning @entity. ++ * @entity: the entity being removed. ++ */ ++static void bfq_idle_extract(struct bfq_service_tree *st, ++ struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ struct rb_node *next; ++ ++ BUG_ON(entity->tree != &st->idle); ++ ++ if (entity == st->first_idle) { ++ next = rb_next(&entity->rb_node); ++ st->first_idle = bfq_entity_of(next); ++ } ++ ++ if (entity == st->last_idle) { ++ next = rb_prev(&entity->rb_node); ++ st->last_idle = bfq_entity_of(next); ++ } ++ ++ bfq_extract(&st->idle, entity); ++ ++ if (bfqq) ++ list_del(&bfqq->bfqq_list); ++} ++ ++/** ++ * bfq_insert - generic tree insertion. ++ * @root: tree root. ++ * @entity: entity to insert. ++ * ++ * This is used for the idle and the active tree, since they are both ++ * ordered by finish time. ++ */ ++static void bfq_insert(struct rb_root *root, struct bfq_entity *entity) ++{ ++ struct bfq_entity *entry; ++ struct rb_node **node = &root->rb_node; ++ struct rb_node *parent = NULL; ++ ++ BUG_ON(entity->tree); ++ ++ while (*node) { ++ parent = *node; ++ entry = rb_entry(parent, struct bfq_entity, rb_node); ++ ++ if (bfq_gt(entry->finish, entity->finish)) ++ node = &parent->rb_left; ++ else ++ node = &parent->rb_right; ++ } ++ ++ rb_link_node(&entity->rb_node, parent, node); ++ rb_insert_color(&entity->rb_node, root); ++ ++ entity->tree = root; ++} ++ ++/** ++ * bfq_update_min - update the min_start field of a entity. ++ * @entity: the entity to update. ++ * @node: one of its children. ++ * ++ * This function is called when @entity may store an invalid value for ++ * min_start due to updates to the active tree. The function assumes ++ * that the subtree rooted at @node (which may be its left or its right ++ * child) has a valid min_start value. ++ */ ++static void bfq_update_min(struct bfq_entity *entity, struct rb_node *node) ++{ ++ struct bfq_entity *child; ++ ++ if (node) { ++ child = rb_entry(node, struct bfq_entity, rb_node); ++ if (bfq_gt(entity->min_start, child->min_start)) ++ entity->min_start = child->min_start; ++ } ++} ++ ++/** ++ * bfq_update_active_node - recalculate min_start. ++ * @node: the node to update. ++ * ++ * @node may have changed position or one of its children may have moved, ++ * this function updates its min_start value. The left and right subtrees ++ * are assumed to hold a correct min_start value. ++ */ ++static void bfq_update_active_node(struct rb_node *node) ++{ ++ struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node); ++ ++ entity->min_start = entity->start; ++ bfq_update_min(entity, node->rb_right); ++ bfq_update_min(entity, node->rb_left); ++} ++ ++/** ++ * bfq_update_active_tree - update min_start for the whole active tree. ++ * @node: the starting node. ++ * ++ * @node must be the deepest modified node after an update. This function ++ * updates its min_start using the values held by its children, assuming ++ * that they did not change, and then updates all the nodes that may have ++ * changed in the path to the root. The only nodes that may have changed ++ * are the ones in the path or their siblings. ++ */ ++static void bfq_update_active_tree(struct rb_node *node) ++{ ++ struct rb_node *parent; ++ ++up: ++ bfq_update_active_node(node); ++ ++ parent = rb_parent(node); ++ if (!parent) ++ return; ++ ++ if (node == parent->rb_left && parent->rb_right) ++ bfq_update_active_node(parent->rb_right); ++ else if (parent->rb_left) ++ bfq_update_active_node(parent->rb_left); ++ ++ node = parent; ++ goto up; ++} ++ ++static void bfq_weights_tree_add(struct bfq_data *bfqd, ++ struct bfq_entity *entity, ++ struct rb_root *root); ++ ++static void bfq_weights_tree_remove(struct bfq_data *bfqd, ++ struct bfq_entity *entity, ++ struct rb_root *root); ++ ++ ++/** ++ * bfq_active_insert - insert an entity in the active tree of its ++ * group/device. ++ * @st: the service tree of the entity. ++ * @entity: the entity being inserted. ++ * ++ * The active tree is ordered by finish time, but an extra key is kept ++ * per each node, containing the minimum value for the start times of ++ * its children (and the node itself), so it's possible to search for ++ * the eligible node with the lowest finish time in logarithmic time. ++ */ ++static void bfq_active_insert(struct bfq_service_tree *st, ++ struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ struct rb_node *node = &entity->rb_node; ++#ifdef CONFIG_BFQ_GROUP_IOSCHED ++ struct bfq_sched_data *sd = NULL; ++ struct bfq_group *bfqg = NULL; ++ struct bfq_data *bfqd = NULL; ++#endif ++ ++ bfq_insert(&st->active, entity); ++ ++ if (node->rb_left) ++ node = node->rb_left; ++ else if (node->rb_right) ++ node = node->rb_right; ++ ++ bfq_update_active_tree(node); ++ ++#ifdef CONFIG_BFQ_GROUP_IOSCHED ++ sd = entity->sched_data; ++ bfqg = container_of(sd, struct bfq_group, sched_data); ++ BUG_ON(!bfqg); ++ bfqd = (struct bfq_data *)bfqg->bfqd; ++#endif ++ if (bfqq) ++ list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list); ++#ifdef CONFIG_BFQ_GROUP_IOSCHED ++ else { /* bfq_group */ ++ BUG_ON(!bfqd); ++ bfq_weights_tree_add(bfqd, entity, &bfqd->group_weights_tree); ++ } ++ if (bfqg != bfqd->root_group) { ++ BUG_ON(!bfqg); ++ BUG_ON(!bfqd); ++ bfqg->active_entities++; ++ if (bfqg->active_entities == 2) ++ bfqd->active_numerous_groups++; ++ } ++#endif ++} ++ ++/** ++ * bfq_ioprio_to_weight - calc a weight from an ioprio. ++ * @ioprio: the ioprio value to convert. ++ */ ++static unsigned short bfq_ioprio_to_weight(int ioprio) ++{ ++ BUG_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR); ++ return IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - ioprio; ++} ++ ++/** ++ * bfq_weight_to_ioprio - calc an ioprio from a weight. ++ * @weight: the weight value to convert. ++ * ++ * To preserve as much as possible the old only-ioprio user interface, ++ * 0 is used as an escape ioprio value for weights (numerically) equal or ++ * larger than IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF. ++ */ ++static unsigned short bfq_weight_to_ioprio(int weight) ++{ ++ BUG_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT); ++ return IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight < 0 ? ++ 0 : IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight; ++} ++ ++static void bfq_get_entity(struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ ++ if (bfqq) { ++ atomic_inc(&bfqq->ref); ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d", ++ bfqq, atomic_read(&bfqq->ref)); ++ } ++} ++ ++/** ++ * bfq_find_deepest - find the deepest node that an extraction can modify. ++ * @node: the node being removed. ++ * ++ * Do the first step of an extraction in an rb tree, looking for the ++ * node that will replace @node, and returning the deepest node that ++ * the following modifications to the tree can touch. If @node is the ++ * last node in the tree return %NULL. ++ */ ++static struct rb_node *bfq_find_deepest(struct rb_node *node) ++{ ++ struct rb_node *deepest; ++ ++ if (!node->rb_right && !node->rb_left) ++ deepest = rb_parent(node); ++ else if (!node->rb_right) ++ deepest = node->rb_left; ++ else if (!node->rb_left) ++ deepest = node->rb_right; ++ else { ++ deepest = rb_next(node); ++ if (deepest->rb_right) ++ deepest = deepest->rb_right; ++ else if (rb_parent(deepest) != node) ++ deepest = rb_parent(deepest); ++ } ++ ++ return deepest; ++} ++ ++/** ++ * bfq_active_extract - remove an entity from the active tree. ++ * @st: the service_tree containing the tree. ++ * @entity: the entity being removed. ++ */ ++static void bfq_active_extract(struct bfq_service_tree *st, ++ struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ struct rb_node *node; ++#ifdef CONFIG_BFQ_GROUP_IOSCHED ++ struct bfq_sched_data *sd = NULL; ++ struct bfq_group *bfqg = NULL; ++ struct bfq_data *bfqd = NULL; ++#endif ++ ++ node = bfq_find_deepest(&entity->rb_node); ++ bfq_extract(&st->active, entity); ++ ++ if (node) ++ bfq_update_active_tree(node); ++ ++#ifdef CONFIG_BFQ_GROUP_IOSCHED ++ sd = entity->sched_data; ++ bfqg = container_of(sd, struct bfq_group, sched_data); ++ BUG_ON(!bfqg); ++ bfqd = (struct bfq_data *)bfqg->bfqd; ++#endif ++ if (bfqq) ++ list_del(&bfqq->bfqq_list); ++#ifdef CONFIG_BFQ_GROUP_IOSCHED ++ else { /* bfq_group */ ++ BUG_ON(!bfqd); ++ bfq_weights_tree_remove(bfqd, entity, ++ &bfqd->group_weights_tree); ++ } ++ if (bfqg != bfqd->root_group) { ++ BUG_ON(!bfqg); ++ BUG_ON(!bfqd); ++ BUG_ON(!bfqg->active_entities); ++ bfqg->active_entities--; ++ if (bfqg->active_entities == 1) { ++ BUG_ON(!bfqd->active_numerous_groups); ++ bfqd->active_numerous_groups--; ++ } ++ } ++#endif ++} ++ ++/** ++ * bfq_idle_insert - insert an entity into the idle tree. ++ * @st: the service tree containing the tree. ++ * @entity: the entity to insert. ++ */ ++static void bfq_idle_insert(struct bfq_service_tree *st, ++ struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ struct bfq_entity *first_idle = st->first_idle; ++ struct bfq_entity *last_idle = st->last_idle; ++ ++ if (!first_idle || bfq_gt(first_idle->finish, entity->finish)) ++ st->first_idle = entity; ++ if (!last_idle || bfq_gt(entity->finish, last_idle->finish)) ++ st->last_idle = entity; ++ ++ bfq_insert(&st->idle, entity); ++ ++ if (bfqq) ++ list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list); ++} ++ ++/** ++ * bfq_forget_entity - remove an entity from the wfq trees. ++ * @st: the service tree. ++ * @entity: the entity being removed. ++ * ++ * Update the device status and forget everything about @entity, putting ++ * the device reference to it, if it is a queue. Entities belonging to ++ * groups are not refcounted. ++ */ ++static void bfq_forget_entity(struct bfq_service_tree *st, ++ struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ struct bfq_sched_data *sd; ++ ++ BUG_ON(!entity->on_st); ++ ++ entity->on_st = 0; ++ st->wsum -= entity->weight; ++ if (bfqq) { ++ sd = entity->sched_data; ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d", ++ bfqq, atomic_read(&bfqq->ref)); ++ bfq_put_queue(bfqq); ++ } ++} ++ ++/** ++ * bfq_put_idle_entity - release the idle tree ref of an entity. ++ * @st: service tree for the entity. ++ * @entity: the entity being released. ++ */ ++static void bfq_put_idle_entity(struct bfq_service_tree *st, ++ struct bfq_entity *entity) ++{ ++ bfq_idle_extract(st, entity); ++ bfq_forget_entity(st, entity); ++} ++ ++/** ++ * bfq_forget_idle - update the idle tree if necessary. ++ * @st: the service tree to act upon. ++ * ++ * To preserve the global O(log N) complexity we only remove one entry here; ++ * as the idle tree will not grow indefinitely this can be done safely. ++ */ ++static void bfq_forget_idle(struct bfq_service_tree *st) ++{ ++ struct bfq_entity *first_idle = st->first_idle; ++ struct bfq_entity *last_idle = st->last_idle; ++ ++ if (RB_EMPTY_ROOT(&st->active) && last_idle && ++ !bfq_gt(last_idle->finish, st->vtime)) { ++ /* ++ * Forget the whole idle tree, increasing the vtime past ++ * the last finish time of idle entities. ++ */ ++ st->vtime = last_idle->finish; ++ } ++ ++ if (first_idle && !bfq_gt(first_idle->finish, st->vtime)) ++ bfq_put_idle_entity(st, first_idle); ++} ++ ++static struct bfq_service_tree * ++__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, ++ struct bfq_entity *entity) ++{ ++ struct bfq_service_tree *new_st = old_st; ++ ++ if (entity->prio_changed) { ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ unsigned short prev_weight, new_weight; ++ struct bfq_data *bfqd = NULL; ++ struct rb_root *root; ++#ifdef CONFIG_BFQ_GROUP_IOSCHED ++ struct bfq_sched_data *sd; ++ struct bfq_group *bfqg; ++#endif ++ ++ if (bfqq) ++ bfqd = bfqq->bfqd; ++#ifdef CONFIG_BFQ_GROUP_IOSCHED ++ else { ++ sd = entity->my_sched_data; ++ bfqg = container_of(sd, struct bfq_group, sched_data); ++ BUG_ON(!bfqg); ++ bfqd = (struct bfq_data *)bfqg->bfqd; ++ BUG_ON(!bfqd); ++ } ++#endif ++ ++ BUG_ON(old_st->wsum < entity->weight); ++ old_st->wsum -= entity->weight; ++ ++ if (entity->new_weight != entity->orig_weight) { ++ if (entity->new_weight < BFQ_MIN_WEIGHT || ++ entity->new_weight > BFQ_MAX_WEIGHT) { ++ printk(KERN_CRIT "update_weight_prio: " ++ "new_weight %d\n", ++ entity->new_weight); ++ BUG(); ++ } ++ entity->orig_weight = entity->new_weight; ++ if (bfqq) ++ bfqq->ioprio = ++ bfq_weight_to_ioprio(entity->orig_weight); ++ } ++ ++ if (bfqq) ++ bfqq->ioprio_class = bfqq->new_ioprio_class; ++ entity->prio_changed = 0; ++ ++ /* ++ * NOTE: here we may be changing the weight too early, ++ * this will cause unfairness. The correct approach ++ * would have required additional complexity to defer ++ * weight changes to the proper time instants (i.e., ++ * when entity->finish <= old_st->vtime). ++ */ ++ new_st = bfq_entity_service_tree(entity); ++ ++ prev_weight = entity->weight; ++ new_weight = entity->orig_weight * ++ (bfqq ? bfqq->wr_coeff : 1); ++ /* ++ * If the weight of the entity changes, remove the entity ++ * from its old weight counter (if there is a counter ++ * associated with the entity), and add it to the counter ++ * associated with its new weight. ++ */ ++ if (prev_weight != new_weight) { ++ root = bfqq ? &bfqd->queue_weights_tree : ++ &bfqd->group_weights_tree; ++ bfq_weights_tree_remove(bfqd, entity, root); ++ } ++ entity->weight = new_weight; ++ /* ++ * Add the entity to its weights tree only if it is ++ * not associated with a weight-raised queue. ++ */ ++ if (prev_weight != new_weight && ++ (bfqq ? bfqq->wr_coeff == 1 : 1)) ++ /* If we get here, root has been initialized. */ ++ bfq_weights_tree_add(bfqd, entity, root); ++ ++ new_st->wsum += entity->weight; ++ ++ if (new_st != old_st) ++ entity->start = new_st->vtime; ++ } ++ ++ return new_st; ++} ++ ++#ifdef CONFIG_BFQ_GROUP_IOSCHED ++static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg); ++#endif ++ ++/** ++ * bfq_bfqq_served - update the scheduler status after selection for ++ * service. ++ * @bfqq: the queue being served. ++ * @served: bytes to transfer. ++ * ++ * NOTE: this can be optimized, as the timestamps of upper level entities ++ * are synchronized every time a new bfqq is selected for service. By now, ++ * we keep it to better check consistency. ++ */ ++static void bfq_bfqq_served(struct bfq_queue *bfqq, int served) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ struct bfq_service_tree *st; ++ ++ for_each_entity(entity) { ++ st = bfq_entity_service_tree(entity); ++ ++ entity->service += served; ++ BUG_ON(entity->service > entity->budget); ++ BUG_ON(st->wsum == 0); ++ ++ st->vtime += bfq_delta(served, st->wsum); ++ bfq_forget_idle(st); ++ } ++#ifdef CONFIG_BFQ_GROUP_IOSCHED ++ bfqg_stats_set_start_empty_time(bfqq_group(bfqq)); ++#endif ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs", served); ++} ++ ++/** ++ * bfq_bfqq_charge_full_budget - set the service to the entity budget. ++ * @bfqq: the queue that needs a service update. ++ * ++ * When it's not possible to be fair in the service domain, because ++ * a queue is not consuming its budget fast enough (the meaning of ++ * fast depends on the timeout parameter), we charge it a full ++ * budget. In this way we should obtain a sort of time-domain ++ * fairness among all the seeky/slow queues. ++ */ ++static void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget"); ++ ++ bfq_bfqq_served(bfqq, entity->budget - entity->service); ++} ++ ++/** ++ * __bfq_activate_entity - activate an entity. ++ * @entity: the entity being activated. ++ * ++ * Called whenever an entity is activated, i.e., it is not active and one ++ * of its children receives a new request, or has to be reactivated due to ++ * budget exhaustion. It uses the current budget of the entity (and the ++ * service received if @entity is active) of the queue to calculate its ++ * timestamps. ++ */ ++static void __bfq_activate_entity(struct bfq_entity *entity) ++{ ++ struct bfq_sched_data *sd = entity->sched_data; ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity); ++ ++ if (entity == sd->in_service_entity) { ++ BUG_ON(entity->tree); ++ /* ++ * If we are requeueing the current entity we have ++ * to take care of not charging to it service it has ++ * not received. ++ */ ++ bfq_calc_finish(entity, entity->service); ++ entity->start = entity->finish; ++ sd->in_service_entity = NULL; ++ } else if (entity->tree == &st->active) { ++ /* ++ * Requeueing an entity due to a change of some ++ * next_in_service entity below it. We reuse the ++ * old start time. ++ */ ++ bfq_active_extract(st, entity); ++ } else if (entity->tree == &st->idle) { ++ /* ++ * Must be on the idle tree, bfq_idle_extract() will ++ * check for that. ++ */ ++ bfq_idle_extract(st, entity); ++ entity->start = bfq_gt(st->vtime, entity->finish) ? ++ st->vtime : entity->finish; ++ } else { ++ /* ++ * The finish time of the entity may be invalid, and ++ * it is in the past for sure, otherwise the queue ++ * would have been on the idle tree. ++ */ ++ entity->start = st->vtime; ++ st->wsum += entity->weight; ++ bfq_get_entity(entity); ++ ++ BUG_ON(entity->on_st); ++ entity->on_st = 1; ++ } ++ ++ st = __bfq_entity_update_weight_prio(st, entity); ++ bfq_calc_finish(entity, entity->budget); ++ bfq_active_insert(st, entity); ++} ++ ++/** ++ * bfq_activate_entity - activate an entity and its ancestors if necessary. ++ * @entity: the entity to activate. ++ * ++ * Activate @entity and all the entities on the path from it to the root. ++ */ ++static void bfq_activate_entity(struct bfq_entity *entity) ++{ ++ struct bfq_sched_data *sd; ++ ++ for_each_entity(entity) { ++ __bfq_activate_entity(entity); ++ ++ sd = entity->sched_data; ++ if (!bfq_update_next_in_service(sd)) ++ /* ++ * No need to propagate the activation to the ++ * upper entities, as they will be updated when ++ * the in-service entity is rescheduled. ++ */ ++ break; ++ } ++} ++ ++/** ++ * __bfq_deactivate_entity - deactivate an entity from its service tree. ++ * @entity: the entity to deactivate. ++ * @requeue: if false, the entity will not be put into the idle tree. ++ * ++ * Deactivate an entity, independently from its previous state. If the ++ * entity was not on a service tree just return, otherwise if it is on ++ * any scheduler tree, extract it from that tree, and if necessary ++ * and if the caller did not specify @requeue, put it on the idle tree. ++ * ++ * Return %1 if the caller should update the entity hierarchy, i.e., ++ * if the entity was in service or if it was the next_in_service for ++ * its sched_data; return %0 otherwise. ++ */ ++static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue) ++{ ++ struct bfq_sched_data *sd = entity->sched_data; ++ struct bfq_service_tree *st; ++ int was_in_service; ++ int ret = 0; ++ ++ if (sd == NULL || !entity->on_st) /* never activated, or inactive */ ++ return 0; ++ ++ st = bfq_entity_service_tree(entity); ++ was_in_service = entity == sd->in_service_entity; ++ ++ BUG_ON(was_in_service && entity->tree); ++ ++ if (was_in_service) { ++ bfq_calc_finish(entity, entity->service); ++ sd->in_service_entity = NULL; ++ } else if (entity->tree == &st->active) ++ bfq_active_extract(st, entity); ++ else if (entity->tree == &st->idle) ++ bfq_idle_extract(st, entity); ++ else if (entity->tree) ++ BUG(); ++ ++ if (was_in_service || sd->next_in_service == entity) ++ ret = bfq_update_next_in_service(sd); ++ ++ if (!requeue || !bfq_gt(entity->finish, st->vtime)) ++ bfq_forget_entity(st, entity); ++ else ++ bfq_idle_insert(st, entity); ++ ++ BUG_ON(sd->in_service_entity == entity); ++ BUG_ON(sd->next_in_service == entity); ++ ++ return ret; ++} ++ ++/** ++ * bfq_deactivate_entity - deactivate an entity. ++ * @entity: the entity to deactivate. ++ * @requeue: true if the entity can be put on the idle tree ++ */ ++static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue) ++{ ++ struct bfq_sched_data *sd; ++ struct bfq_entity *parent; ++ ++ for_each_entity_safe(entity, parent) { ++ sd = entity->sched_data; ++ ++ if (!__bfq_deactivate_entity(entity, requeue)) ++ /* ++ * The parent entity is still backlogged, and ++ * we don't need to update it as it is still ++ * in service. ++ */ ++ break; ++ ++ if (sd->next_in_service) ++ /* ++ * The parent entity is still backlogged and ++ * the budgets on the path towards the root ++ * need to be updated. ++ */ ++ goto update; ++ ++ /* ++ * If we reach there the parent is no more backlogged and ++ * we want to propagate the dequeue upwards. ++ */ ++ requeue = 1; ++ } ++ ++ return; ++ ++update: ++ entity = parent; ++ for_each_entity(entity) { ++ __bfq_activate_entity(entity); ++ ++ sd = entity->sched_data; ++ if (!bfq_update_next_in_service(sd)) ++ break; ++ } ++} ++ ++/** ++ * bfq_update_vtime - update vtime if necessary. ++ * @st: the service tree to act upon. ++ * ++ * If necessary update the service tree vtime to have at least one ++ * eligible entity, skipping to its start time. Assumes that the ++ * active tree of the device is not empty. ++ * ++ * NOTE: this hierarchical implementation updates vtimes quite often, ++ * we may end up with reactivated processes getting timestamps after a ++ * vtime skip done because we needed a ->first_active entity on some ++ * intermediate node. ++ */ ++static void bfq_update_vtime(struct bfq_service_tree *st) ++{ ++ struct bfq_entity *entry; ++ struct rb_node *node = st->active.rb_node; ++ ++ entry = rb_entry(node, struct bfq_entity, rb_node); ++ if (bfq_gt(entry->min_start, st->vtime)) { ++ st->vtime = entry->min_start; ++ bfq_forget_idle(st); ++ } ++} ++ ++/** ++ * bfq_first_active_entity - find the eligible entity with ++ * the smallest finish time ++ * @st: the service tree to select from. ++ * ++ * This function searches the first schedulable entity, starting from the ++ * root of the tree and going on the left every time on this side there is ++ * a subtree with at least one eligible (start >= vtime) entity. The path on ++ * the right is followed only if a) the left subtree contains no eligible ++ * entities and b) no eligible entity has been found yet. ++ */ ++static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st) ++{ ++ struct bfq_entity *entry, *first = NULL; ++ struct rb_node *node = st->active.rb_node; ++ ++ while (node) { ++ entry = rb_entry(node, struct bfq_entity, rb_node); ++left: ++ if (!bfq_gt(entry->start, st->vtime)) ++ first = entry; ++ ++ BUG_ON(bfq_gt(entry->min_start, st->vtime)); ++ ++ if (node->rb_left) { ++ entry = rb_entry(node->rb_left, ++ struct bfq_entity, rb_node); ++ if (!bfq_gt(entry->min_start, st->vtime)) { ++ node = node->rb_left; ++ goto left; ++ } ++ } ++ if (first) ++ break; ++ node = node->rb_right; ++ } ++ ++ BUG_ON(!first && !RB_EMPTY_ROOT(&st->active)); ++ return first; ++} ++ ++/** ++ * __bfq_lookup_next_entity - return the first eligible entity in @st. ++ * @st: the service tree. ++ * ++ * Update the virtual time in @st and return the first eligible entity ++ * it contains. ++ */ ++static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st, ++ bool force) ++{ ++ struct bfq_entity *entity, *new_next_in_service = NULL; ++ ++ if (RB_EMPTY_ROOT(&st->active)) ++ return NULL; ++ ++ bfq_update_vtime(st); ++ entity = bfq_first_active_entity(st); ++ BUG_ON(bfq_gt(entity->start, st->vtime)); ++ ++ /* ++ * If the chosen entity does not match with the sched_data's ++ * next_in_service and we are forcedly serving the IDLE priority ++ * class tree, bubble up budget update. ++ */ ++ if (unlikely(force && entity != entity->sched_data->next_in_service)) { ++ new_next_in_service = entity; ++ for_each_entity(new_next_in_service) ++ bfq_update_budget(new_next_in_service); ++ } ++ ++ return entity; ++} ++ ++/** ++ * bfq_lookup_next_entity - return the first eligible entity in @sd. ++ * @sd: the sched_data. ++ * @extract: if true the returned entity will be also extracted from @sd. ++ * ++ * NOTE: since we cache the next_in_service entity at each level of the ++ * hierarchy, the complexity of the lookup can be decreased with ++ * absolutely no effort just returning the cached next_in_service value; ++ * we prefer to do full lookups to test the consistency of * the data ++ * structures. ++ */ ++static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, ++ int extract, ++ struct bfq_data *bfqd) ++{ ++ struct bfq_service_tree *st = sd->service_tree; ++ struct bfq_entity *entity; ++ int i = 0; ++ ++ BUG_ON(sd->in_service_entity); ++ ++ if (bfqd && ++ jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) { ++ entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1, ++ true); ++ if (entity) { ++ i = BFQ_IOPRIO_CLASSES - 1; ++ bfqd->bfq_class_idle_last_service = jiffies; ++ sd->next_in_service = entity; ++ } ++ } ++ for (; i < BFQ_IOPRIO_CLASSES; i++) { ++ entity = __bfq_lookup_next_entity(st + i, false); ++ if (entity) { ++ if (extract) { ++ bfq_check_next_in_service(sd, entity); ++ bfq_active_extract(st + i, entity); ++ sd->in_service_entity = entity; ++ sd->next_in_service = NULL; ++ } ++ break; ++ } ++ } ++ ++ return entity; ++} ++ ++/* ++ * Get next queue for service. ++ */ ++static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) ++{ ++ struct bfq_entity *entity = NULL; ++ struct bfq_sched_data *sd; ++ struct bfq_queue *bfqq; ++ ++ BUG_ON(bfqd->in_service_queue); ++ ++ if (bfqd->busy_queues == 0) ++ return NULL; ++ ++ sd = &bfqd->root_group->sched_data; ++ for (; sd ; sd = entity->my_sched_data) { ++ entity = bfq_lookup_next_entity(sd, 1, bfqd); ++ BUG_ON(!entity); ++ entity->service = 0; ++ } ++ ++ bfqq = bfq_entity_to_bfqq(entity); ++ BUG_ON(!bfqq); ++ ++ return bfqq; ++} ++ ++static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) ++{ ++ if (bfqd->in_service_bic) { ++ put_io_context(bfqd->in_service_bic->icq.ioc); ++ bfqd->in_service_bic = NULL; ++ } ++ ++ bfqd->in_service_queue = NULL; ++ del_timer(&bfqd->idle_slice_timer); ++} ++ ++static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ int requeue) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ ++ if (bfqq == bfqd->in_service_queue) ++ __bfq_bfqd_reset_in_service(bfqd); ++ ++ bfq_deactivate_entity(entity, requeue); ++} ++ ++static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ ++ bfq_activate_entity(entity); ++} ++ ++#ifdef CONFIG_BFQ_GROUP_IOSCHED ++static void bfqg_stats_update_dequeue(struct bfq_group *bfqg); ++#endif ++ ++/* ++ * Called when the bfqq no longer has requests pending, remove it from ++ * the service tree. ++ */ ++static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ int requeue) ++{ ++ BUG_ON(!bfq_bfqq_busy(bfqq)); ++ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); ++ ++ bfq_log_bfqq(bfqd, bfqq, "del from busy"); ++ ++ bfq_clear_bfqq_busy(bfqq); ++ ++ BUG_ON(bfqd->busy_queues == 0); ++ bfqd->busy_queues--; ++ ++ if (!bfqq->dispatched) { ++ bfq_weights_tree_remove(bfqd, &bfqq->entity, ++ &bfqd->queue_weights_tree); ++ if (!blk_queue_nonrot(bfqd->queue)) { ++ BUG_ON(!bfqd->busy_in_flight_queues); ++ bfqd->busy_in_flight_queues--; ++ if (bfq_bfqq_constantly_seeky(bfqq)) { ++ BUG_ON(!bfqd-> ++ const_seeky_busy_in_flight_queues); ++ bfqd->const_seeky_busy_in_flight_queues--; ++ } ++ } ++ } ++ if (bfqq->wr_coeff > 1) ++ bfqd->wr_busy_queues--; ++ ++#ifdef CONFIG_BFQ_GROUP_IOSCHED ++ bfqg_stats_update_dequeue(bfqq_group(bfqq)); ++#endif ++ ++ bfq_deactivate_bfqq(bfqd, bfqq, requeue); ++} ++ ++/* ++ * Called when an inactive queue receives a new request. ++ */ ++static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ BUG_ON(bfq_bfqq_busy(bfqq)); ++ BUG_ON(bfqq == bfqd->in_service_queue); ++ ++ bfq_log_bfqq(bfqd, bfqq, "add to busy"); ++ ++ bfq_activate_bfqq(bfqd, bfqq); ++ ++ bfq_mark_bfqq_busy(bfqq); ++ bfqd->busy_queues++; ++ ++ if (!bfqq->dispatched) { ++ if (bfqq->wr_coeff == 1) ++ bfq_weights_tree_add(bfqd, &bfqq->entity, ++ &bfqd->queue_weights_tree); ++ if (!blk_queue_nonrot(bfqd->queue)) { ++ bfqd->busy_in_flight_queues++; ++ if (bfq_bfqq_constantly_seeky(bfqq)) ++ bfqd->const_seeky_busy_in_flight_queues++; ++ } ++ } ++ if (bfqq->wr_coeff > 1) ++ bfqd->wr_busy_queues++; ++} +diff --git a/block/bfq.h b/block/bfq.h +new file mode 100644 +index 0000000..3bb7df2 +--- /dev/null ++++ b/block/bfq.h +@@ -0,0 +1,801 @@ ++/* ++ * BFQ-v7r11 for 4.5.0: data structures and common functions prototypes. ++ * ++ * Based on ideas and code from CFQ: ++ * Copyright (C) 2003 Jens Axboe ++ * ++ * Copyright (C) 2008 Fabio Checconi ++ * Paolo Valente ++ * ++ * Copyright (C) 2010 Paolo Valente ++ */ ++ ++#ifndef _BFQ_H ++#define _BFQ_H ++ ++#include ++#include ++#include ++#include ++#include ++ ++#define BFQ_IOPRIO_CLASSES 3 ++#define BFQ_CL_IDLE_TIMEOUT (HZ/5) ++ ++#define BFQ_MIN_WEIGHT 1 ++#define BFQ_MAX_WEIGHT 1000 ++#define BFQ_WEIGHT_CONVERSION_COEFF 10 ++ ++#define BFQ_DEFAULT_QUEUE_IOPRIO 4 ++ ++#define BFQ_DEFAULT_GRP_WEIGHT 10 ++#define BFQ_DEFAULT_GRP_IOPRIO 0 ++#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE ++ ++struct bfq_entity; ++ ++/** ++ * struct bfq_service_tree - per ioprio_class service tree. ++ * @active: tree for active entities (i.e., those backlogged). ++ * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i). ++ * @first_idle: idle entity with minimum F_i. ++ * @last_idle: idle entity with maximum F_i. ++ * @vtime: scheduler virtual time. ++ * @wsum: scheduler weight sum; active and idle entities contribute to it. ++ * ++ * Each service tree represents a B-WF2Q+ scheduler on its own. Each ++ * ioprio_class has its own independent scheduler, and so its own ++ * bfq_service_tree. All the fields are protected by the queue lock ++ * of the containing bfqd. ++ */ ++struct bfq_service_tree { ++ struct rb_root active; ++ struct rb_root idle; ++ ++ struct bfq_entity *first_idle; ++ struct bfq_entity *last_idle; ++ ++ u64 vtime; ++ unsigned long wsum; ++}; ++ ++/** ++ * struct bfq_sched_data - multi-class scheduler. ++ * @in_service_entity: entity in service. ++ * @next_in_service: head-of-the-line entity in the scheduler. ++ * @service_tree: array of service trees, one per ioprio_class. ++ * ++ * bfq_sched_data is the basic scheduler queue. It supports three ++ * ioprio_classes, and can be used either as a toplevel queue or as ++ * an intermediate queue on a hierarchical setup. ++ * @next_in_service points to the active entity of the sched_data ++ * service trees that will be scheduled next. ++ * ++ * The supported ioprio_classes are the same as in CFQ, in descending ++ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE. ++ * Requests from higher priority queues are served before all the ++ * requests from lower priority queues; among requests of the same ++ * queue requests are served according to B-WF2Q+. ++ * All the fields are protected by the queue lock of the containing bfqd. ++ */ ++struct bfq_sched_data { ++ struct bfq_entity *in_service_entity; ++ struct bfq_entity *next_in_service; ++ struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES]; ++}; ++ ++/** ++ * struct bfq_weight_counter - counter of the number of all active entities ++ * with a given weight. ++ * @weight: weight of the entities that this counter refers to. ++ * @num_active: number of active entities with this weight. ++ * @weights_node: weights tree member (see bfq_data's @queue_weights_tree ++ * and @group_weights_tree). ++ */ ++struct bfq_weight_counter { ++ short int weight; ++ unsigned int num_active; ++ struct rb_node weights_node; ++}; ++ ++/** ++ * struct bfq_entity - schedulable entity. ++ * @rb_node: service_tree member. ++ * @weight_counter: pointer to the weight counter associated with this entity. ++ * @on_st: flag, true if the entity is on a tree (either the active or ++ * the idle one of its service_tree). ++ * @finish: B-WF2Q+ finish timestamp (aka F_i). ++ * @start: B-WF2Q+ start timestamp (aka S_i). ++ * @tree: tree the entity is enqueued into; %NULL if not on a tree. ++ * @min_start: minimum start time of the (active) subtree rooted at ++ * this entity; used for O(log N) lookups into active trees. ++ * @service: service received during the last round of service. ++ * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight. ++ * @weight: weight of the queue ++ * @parent: parent entity, for hierarchical scheduling. ++ * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the ++ * associated scheduler queue, %NULL on leaf nodes. ++ * @sched_data: the scheduler queue this entity belongs to. ++ * @ioprio: the ioprio in use. ++ * @new_weight: when a weight change is requested, the new weight value. ++ * @orig_weight: original weight, used to implement weight boosting ++ * @prio_changed: flag, true when the user requested a weight, ioprio or ++ * ioprio_class change. ++ * ++ * A bfq_entity is used to represent either a bfq_queue (leaf node in the ++ * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each ++ * entity belongs to the sched_data of the parent group in the cgroup ++ * hierarchy. Non-leaf entities have also their own sched_data, stored ++ * in @my_sched_data. ++ * ++ * Each entity stores independently its priority values; this would ++ * allow different weights on different devices, but this ++ * functionality is not exported to userspace by now. Priorities and ++ * weights are updated lazily, first storing the new values into the ++ * new_* fields, then setting the @prio_changed flag. As soon as ++ * there is a transition in the entity state that allows the priority ++ * update to take place the effective and the requested priority ++ * values are synchronized. ++ * ++ * Unless cgroups are used, the weight value is calculated from the ++ * ioprio to export the same interface as CFQ. When dealing with ++ * ``well-behaved'' queues (i.e., queues that do not spend too much ++ * time to consume their budget and have true sequential behavior, and ++ * when there are no external factors breaking anticipation) the ++ * relative weights at each level of the cgroups hierarchy should be ++ * guaranteed. All the fields are protected by the queue lock of the ++ * containing bfqd. ++ */ ++struct bfq_entity { ++ struct rb_node rb_node; ++ struct bfq_weight_counter *weight_counter; ++ ++ int on_st; ++ ++ u64 finish; ++ u64 start; ++ ++ struct rb_root *tree; ++ ++ u64 min_start; ++ ++ int service, budget; ++ unsigned short weight, new_weight; ++ unsigned short orig_weight; ++ ++ struct bfq_entity *parent; ++ ++ struct bfq_sched_data *my_sched_data; ++ struct bfq_sched_data *sched_data; ++ ++ int prio_changed; ++}; ++ ++struct bfq_group; ++ ++/** ++ * struct bfq_queue - leaf schedulable entity. ++ * @ref: reference counter. ++ * @bfqd: parent bfq_data. ++ * @new_ioprio: when an ioprio change is requested, the new ioprio value. ++ * @ioprio_class: the ioprio_class in use. ++ * @new_ioprio_class: when an ioprio_class change is requested, the new ++ * ioprio_class value. ++ * @new_bfqq: shared bfq_queue if queue is cooperating with ++ * one or more other queues. ++ * @sort_list: sorted list of pending requests. ++ * @next_rq: if fifo isn't expired, next request to serve. ++ * @queued: nr of requests queued in @sort_list. ++ * @allocated: currently allocated requests. ++ * @meta_pending: pending metadata requests. ++ * @fifo: fifo list of requests in sort_list. ++ * @entity: entity representing this queue in the scheduler. ++ * @max_budget: maximum budget allowed from the feedback mechanism. ++ * @budget_timeout: budget expiration (in jiffies). ++ * @dispatched: number of requests on the dispatch list or inside driver. ++ * @flags: status flags. ++ * @bfqq_list: node for active/idle bfqq list inside our bfqd. ++ * @burst_list_node: node for the device's burst list. ++ * @seek_samples: number of seeks sampled ++ * @seek_total: sum of the distances of the seeks sampled ++ * @seek_mean: mean seek distance ++ * @last_request_pos: position of the last request enqueued ++ * @requests_within_timer: number of consecutive pairs of request completion ++ * and arrival, such that the queue becomes idle ++ * after the completion, but the next request arrives ++ * within an idle time slice; used only if the queue's ++ * IO_bound has been cleared. ++ * @pid: pid of the process owning the queue, used for logging purposes. ++ * @last_wr_start_finish: start time of the current weight-raising period if ++ * the @bfq-queue is being weight-raised, otherwise ++ * finish time of the last weight-raising period ++ * @wr_cur_max_time: current max raising time for this queue ++ * @soft_rt_next_start: minimum time instant such that, only if a new ++ * request is enqueued after this time instant in an ++ * idle @bfq_queue with no outstanding requests, then ++ * the task associated with the queue it is deemed as ++ * soft real-time (see the comments to the function ++ * bfq_bfqq_softrt_next_start()) ++ * @last_idle_bklogged: time of the last transition of the @bfq_queue from ++ * idle to backlogged ++ * @service_from_backlogged: cumulative service received from the @bfq_queue ++ * since the last transition from idle to ++ * backlogged ++ * @bic: pointer to the bfq_io_cq owning the bfq_queue, set to %NULL if the ++ * queue is shared ++ * ++ * A bfq_queue is a leaf request queue; it can be associated with an ++ * io_context or more, if it is async or shared between cooperating ++ * processes. @cgroup holds a reference to the cgroup, to be sure that it ++ * does not disappear while a bfqq still references it (mostly to avoid ++ * races between request issuing and task migration followed by cgroup ++ * destruction). ++ * All the fields are protected by the queue lock of the containing bfqd. ++ */ ++struct bfq_queue { ++ atomic_t ref; ++ struct bfq_data *bfqd; ++ ++ unsigned short ioprio, new_ioprio; ++ unsigned short ioprio_class, new_ioprio_class; ++ ++ /* fields for cooperating queues handling */ ++ struct bfq_queue *new_bfqq; ++ struct rb_node pos_node; ++ struct rb_root *pos_root; ++ ++ struct rb_root sort_list; ++ struct request *next_rq; ++ int queued[2]; ++ int allocated[2]; ++ int meta_pending; ++ struct list_head fifo; ++ ++ struct bfq_entity entity; ++ ++ int max_budget; ++ unsigned long budget_timeout; ++ ++ int dispatched; ++ ++ unsigned int flags; ++ ++ struct list_head bfqq_list; ++ ++ struct hlist_node burst_list_node; ++ ++ unsigned int seek_samples; ++ u64 seek_total; ++ sector_t seek_mean; ++ sector_t last_request_pos; ++ ++ unsigned int requests_within_timer; ++ ++ pid_t pid; ++ struct bfq_io_cq *bic; ++ ++ /* weight-raising fields */ ++ unsigned long wr_cur_max_time; ++ unsigned long soft_rt_next_start; ++ unsigned long last_wr_start_finish; ++ unsigned int wr_coeff; ++ unsigned long last_idle_bklogged; ++ unsigned long service_from_backlogged; ++}; ++ ++/** ++ * struct bfq_ttime - per process thinktime stats. ++ * @ttime_total: total process thinktime ++ * @ttime_samples: number of thinktime samples ++ * @ttime_mean: average process thinktime ++ */ ++struct bfq_ttime { ++ unsigned long last_end_request; ++ ++ unsigned long ttime_total; ++ unsigned long ttime_samples; ++ unsigned long ttime_mean; ++}; ++ ++/** ++ * struct bfq_io_cq - per (request_queue, io_context) structure. ++ * @icq: associated io_cq structure ++ * @bfqq: array of two process queues, the sync and the async ++ * @ttime: associated @bfq_ttime struct ++ * @ioprio: per (request_queue, blkcg) ioprio. ++ * @blkcg_id: id of the blkcg the related io_cq belongs to. ++ */ ++struct bfq_io_cq { ++ struct io_cq icq; /* must be the first member */ ++ struct bfq_queue *bfqq[2]; ++ struct bfq_ttime ttime; ++ int ioprio; ++ ++#ifdef CONFIG_BFQ_GROUP_IOSCHED ++ uint64_t blkcg_id; /* the current blkcg ID */ ++#endif ++}; ++ ++enum bfq_device_speed { ++ BFQ_BFQD_FAST, ++ BFQ_BFQD_SLOW, ++}; ++ ++/** ++ * struct bfq_data - per device data structure. ++ * @queue: request queue for the managed device. ++ * @root_group: root bfq_group for the device. ++ * @active_numerous_groups: number of bfq_groups containing more than one ++ * active @bfq_entity. ++ * @queue_weights_tree: rbtree of weight counters of @bfq_queues, sorted by ++ * weight. Used to keep track of whether all @bfq_queues ++ * have the same weight. The tree contains one counter ++ * for each distinct weight associated to some active ++ * and not weight-raised @bfq_queue (see the comments to ++ * the functions bfq_weights_tree_[add|remove] for ++ * further details). ++ * @group_weights_tree: rbtree of non-queue @bfq_entity weight counters, sorted ++ * by weight. Used to keep track of whether all ++ * @bfq_groups have the same weight. The tree contains ++ * one counter for each distinct weight associated to ++ * some active @bfq_group (see the comments to the ++ * functions bfq_weights_tree_[add|remove] for further ++ * details). ++ * @busy_queues: number of bfq_queues containing requests (including the ++ * queue in service, even if it is idling). ++ * @busy_in_flight_queues: number of @bfq_queues containing pending or ++ * in-flight requests, plus the @bfq_queue in ++ * service, even if idle but waiting for the ++ * possible arrival of its next sync request. This ++ * field is updated only if the device is rotational, ++ * but used only if the device is also NCQ-capable. ++ * The reason why the field is updated also for non- ++ * NCQ-capable rotational devices is related to the ++ * fact that the value of @hw_tag may be set also ++ * later than when busy_in_flight_queues may need to ++ * be incremented for the first time(s). Taking also ++ * this possibility into account, to avoid unbalanced ++ * increments/decrements, would imply more overhead ++ * than just updating busy_in_flight_queues ++ * regardless of the value of @hw_tag. ++ * @const_seeky_busy_in_flight_queues: number of constantly-seeky @bfq_queues ++ * (that is, seeky queues that expired ++ * for budget timeout at least once) ++ * containing pending or in-flight ++ * requests, including the in-service ++ * @bfq_queue if constantly seeky. This ++ * field is updated only if the device ++ * is rotational, but used only if the ++ * device is also NCQ-capable (see the ++ * comments to @busy_in_flight_queues). ++ * @wr_busy_queues: number of weight-raised busy @bfq_queues. ++ * @queued: number of queued requests. ++ * @rq_in_driver: number of requests dispatched and waiting for completion. ++ * @sync_flight: number of sync requests in the driver. ++ * @max_rq_in_driver: max number of reqs in driver in the last ++ * @hw_tag_samples completed requests. ++ * @hw_tag_samples: nr of samples used to calculate hw_tag. ++ * @hw_tag: flag set to one if the driver is showing a queueing behavior. ++ * @budgets_assigned: number of budgets assigned. ++ * @idle_slice_timer: timer set when idling for the next sequential request ++ * from the queue in service. ++ * @unplug_work: delayed work to restart dispatching on the request queue. ++ * @in_service_queue: bfq_queue in service. ++ * @in_service_bic: bfq_io_cq (bic) associated with the @in_service_queue. ++ * @last_position: on-disk position of the last served request. ++ * @last_budget_start: beginning of the last budget. ++ * @last_idling_start: beginning of the last idle slice. ++ * @peak_rate: peak transfer rate observed for a budget. ++ * @peak_rate_samples: number of samples used to calculate @peak_rate. ++ * @bfq_max_budget: maximum budget allotted to a bfq_queue before ++ * rescheduling. ++ * @active_list: list of all the bfq_queues active on the device. ++ * @idle_list: list of all the bfq_queues idle on the device. ++ * @bfq_fifo_expire: timeout for async/sync requests; when it expires ++ * requests are served in fifo order. ++ * @bfq_back_penalty: weight of backward seeks wrt forward ones. ++ * @bfq_back_max: maximum allowed backward seek. ++ * @bfq_slice_idle: maximum idling time. ++ * @bfq_user_max_budget: user-configured max budget value ++ * (0 for auto-tuning). ++ * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to ++ * async queues. ++ * @bfq_timeout: timeout for bfq_queues to consume their budget; used to ++ * to prevent seeky queues to impose long latencies to well ++ * behaved ones (this also implies that seeky queues cannot ++ * receive guarantees in the service domain; after a timeout ++ * they are charged for the whole allocated budget, to try ++ * to preserve a behavior reasonably fair among them, but ++ * without service-domain guarantees). ++ * @bfq_coop_thresh: number of queue merges after which a @bfq_queue is ++ * no more granted any weight-raising. ++ * @bfq_failed_cooperations: number of consecutive failed cooperation ++ * chances after which weight-raising is restored ++ * to a queue subject to more than bfq_coop_thresh ++ * queue merges. ++ * @bfq_requests_within_timer: number of consecutive requests that must be ++ * issued within the idle time slice to set ++ * again idling to a queue which was marked as ++ * non-I/O-bound (see the definition of the ++ * IO_bound flag for further details). ++ * @last_ins_in_burst: last time at which a queue entered the current ++ * burst of queues being activated shortly after ++ * each other; for more details about this and the ++ * following parameters related to a burst of ++ * activations, see the comments to the function ++ * @bfq_handle_burst. ++ * @bfq_burst_interval: reference time interval used to decide whether a ++ * queue has been activated shortly after ++ * @last_ins_in_burst. ++ * @burst_size: number of queues in the current burst of queue activations. ++ * @bfq_large_burst_thresh: maximum burst size above which the current ++ * queue-activation burst is deemed as 'large'. ++ * @large_burst: true if a large queue-activation burst is in progress. ++ * @burst_list: head of the burst list (as for the above fields, more details ++ * in the comments to the function bfq_handle_burst). ++ * @low_latency: if set to true, low-latency heuristics are enabled. ++ * @bfq_wr_coeff: maximum factor by which the weight of a weight-raised ++ * queue is multiplied. ++ * @bfq_wr_max_time: maximum duration of a weight-raising period (jiffies). ++ * @bfq_wr_rt_max_time: maximum duration for soft real-time processes. ++ * @bfq_wr_min_idle_time: minimum idle period after which weight-raising ++ * may be reactivated for a queue (in jiffies). ++ * @bfq_wr_min_inter_arr_async: minimum period between request arrivals ++ * after which weight-raising may be ++ * reactivated for an already busy queue ++ * (in jiffies). ++ * @bfq_wr_max_softrt_rate: max service-rate for a soft real-time queue, ++ * sectors per seconds. ++ * @RT_prod: cached value of the product R*T used for computing the maximum ++ * duration of the weight raising automatically. ++ * @device_speed: device-speed class for the low-latency heuristic. ++ * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions. ++ * ++ * All the fields are protected by the @queue lock. ++ */ ++struct bfq_data { ++ struct request_queue *queue; ++ ++ struct bfq_group *root_group; ++ ++#ifdef CONFIG_BFQ_GROUP_IOSCHED ++ int active_numerous_groups; ++#endif ++ ++ struct rb_root queue_weights_tree; ++ struct rb_root group_weights_tree; ++ ++ int busy_queues; ++ int busy_in_flight_queues; ++ int const_seeky_busy_in_flight_queues; ++ int wr_busy_queues; ++ int queued; ++ int rq_in_driver; ++ int sync_flight; ++ ++ int max_rq_in_driver; ++ int hw_tag_samples; ++ int hw_tag; ++ ++ int budgets_assigned; ++ ++ struct timer_list idle_slice_timer; ++ struct work_struct unplug_work; ++ ++ struct bfq_queue *in_service_queue; ++ struct bfq_io_cq *in_service_bic; ++ ++ sector_t last_position; ++ ++ ktime_t last_budget_start; ++ ktime_t last_idling_start; ++ int peak_rate_samples; ++ u64 peak_rate; ++ int bfq_max_budget; ++ ++ struct list_head active_list; ++ struct list_head idle_list; ++ ++ unsigned int bfq_fifo_expire[2]; ++ unsigned int bfq_back_penalty; ++ unsigned int bfq_back_max; ++ unsigned int bfq_slice_idle; ++ u64 bfq_class_idle_last_service; ++ ++ int bfq_user_max_budget; ++ int bfq_max_budget_async_rq; ++ unsigned int bfq_timeout[2]; ++ ++ unsigned int bfq_coop_thresh; ++ unsigned int bfq_failed_cooperations; ++ unsigned int bfq_requests_within_timer; ++ ++ unsigned long last_ins_in_burst; ++ unsigned long bfq_burst_interval; ++ int burst_size; ++ unsigned long bfq_large_burst_thresh; ++ bool large_burst; ++ struct hlist_head burst_list; ++ ++ bool low_latency; ++ ++ /* parameters of the low_latency heuristics */ ++ unsigned int bfq_wr_coeff; ++ unsigned int bfq_wr_max_time; ++ unsigned int bfq_wr_rt_max_time; ++ unsigned int bfq_wr_min_idle_time; ++ unsigned long bfq_wr_min_inter_arr_async; ++ unsigned int bfq_wr_max_softrt_rate; ++ u64 RT_prod; ++ enum bfq_device_speed device_speed; ++ ++ struct bfq_queue oom_bfqq; ++}; ++ ++enum bfqq_state_flags { ++ BFQ_BFQQ_FLAG_busy = 0, /* has requests or is in service */ ++ BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */ ++ BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */ ++ BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ ++ BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */ ++ BFQ_BFQQ_FLAG_sync, /* synchronous queue */ ++ BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */ ++ BFQ_BFQQ_FLAG_IO_bound, /* ++ * bfqq has timed-out at least once ++ * having consumed at most 2/10 of ++ * its budget ++ */ ++ BFQ_BFQQ_FLAG_in_large_burst, /* ++ * bfqq activated in a large burst, ++ * see comments to bfq_handle_burst. ++ */ ++ BFQ_BFQQ_FLAG_constantly_seeky, /* ++ * bfqq has proved to be slow and ++ * seeky until budget timeout ++ */ ++ BFQ_BFQQ_FLAG_softrt_update, /* ++ * may need softrt-next-start ++ * update ++ */ ++}; ++ ++#define BFQ_BFQQ_FNS(name) \ ++static void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \ ++{ \ ++ (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \ ++} \ ++static void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \ ++{ \ ++ (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \ ++} \ ++static int bfq_bfqq_##name(const struct bfq_queue *bfqq) \ ++{ \ ++ return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \ ++} ++ ++BFQ_BFQQ_FNS(busy); ++BFQ_BFQQ_FNS(wait_request); ++BFQ_BFQQ_FNS(must_alloc); ++BFQ_BFQQ_FNS(fifo_expire); ++BFQ_BFQQ_FNS(idle_window); ++BFQ_BFQQ_FNS(sync); ++BFQ_BFQQ_FNS(budget_new); ++BFQ_BFQQ_FNS(IO_bound); ++BFQ_BFQQ_FNS(in_large_burst); ++BFQ_BFQQ_FNS(constantly_seeky); ++BFQ_BFQQ_FNS(softrt_update); ++#undef BFQ_BFQQ_FNS ++ ++/* Logging facilities. */ ++#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ ++ blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args) ++ ++#define bfq_log(bfqd, fmt, args...) \ ++ blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) ++ ++/* Expiration reasons. */ ++enum bfqq_expiration { ++ BFQ_BFQQ_TOO_IDLE = 0, /* ++ * queue has been idling for ++ * too long ++ */ ++ BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */ ++ BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */ ++ BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */ ++}; ++ ++#ifdef CONFIG_BFQ_GROUP_IOSCHED ++ ++struct bfqg_stats { ++ /* total bytes transferred */ ++ struct blkg_rwstat service_bytes; ++ /* total IOs serviced, post merge */ ++ struct blkg_rwstat serviced; ++ /* number of ios merged */ ++ struct blkg_rwstat merged; ++ /* total time spent on device in ns, may not be accurate w/ queueing */ ++ struct blkg_rwstat service_time; ++ /* total time spent waiting in scheduler queue in ns */ ++ struct blkg_rwstat wait_time; ++ /* number of IOs queued up */ ++ struct blkg_rwstat queued; ++ /* total sectors transferred */ ++ struct blkg_stat sectors; ++ /* total disk time and nr sectors dispatched by this group */ ++ struct blkg_stat time; ++ /* time not charged to this cgroup */ ++ struct blkg_stat unaccounted_time; ++ /* sum of number of ios queued across all samples */ ++ struct blkg_stat avg_queue_size_sum; ++ /* count of samples taken for average */ ++ struct blkg_stat avg_queue_size_samples; ++ /* how many times this group has been removed from service tree */ ++ struct blkg_stat dequeue; ++ /* total time spent waiting for it to be assigned a timeslice. */ ++ struct blkg_stat group_wait_time; ++ /* time spent idling for this blkcg_gq */ ++ struct blkg_stat idle_time; ++ /* total time with empty current active q with other requests queued */ ++ struct blkg_stat empty_time; ++ /* fields after this shouldn't be cleared on stat reset */ ++ uint64_t start_group_wait_time; ++ uint64_t start_idle_time; ++ uint64_t start_empty_time; ++ uint16_t flags; ++}; ++ ++/* ++ * struct bfq_group_data - per-blkcg storage for the blkio subsystem. ++ * ++ * @ps: @blkcg_policy_storage that this structure inherits ++ * @weight: weight of the bfq_group ++ */ ++struct bfq_group_data { ++ /* must be the first member */ ++ struct blkcg_policy_data pd; ++ ++ unsigned short weight; ++}; ++ ++/** ++ * struct bfq_group - per (device, cgroup) data structure. ++ * @entity: schedulable entity to insert into the parent group sched_data. ++ * @sched_data: own sched_data, to contain child entities (they may be ++ * both bfq_queues and bfq_groups). ++ * @bfqd: the bfq_data for the device this group acts upon. ++ * @async_bfqq: array of async queues for all the tasks belonging to ++ * the group, one queue per ioprio value per ioprio_class, ++ * except for the idle class that has only one queue. ++ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored). ++ * @my_entity: pointer to @entity, %NULL for the toplevel group; used ++ * to avoid too many special cases during group creation/ ++ * migration. ++ * @active_entities: number of active entities belonging to the group; ++ * unused for the root group. Used to know whether there ++ * are groups with more than one active @bfq_entity ++ * (see the comments to the function ++ * bfq_bfqq_must_not_expire()). ++ * ++ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup ++ * there is a set of bfq_groups, each one collecting the lower-level ++ * entities belonging to the group that are acting on the same device. ++ * ++ * Locking works as follows: ++ * o @bfqd is protected by the queue lock, RCU is used to access it ++ * from the readers. ++ * o All the other fields are protected by the @bfqd queue lock. ++ */ ++struct bfq_group { ++ /* must be the first member */ ++ struct blkg_policy_data pd; ++ ++ struct bfq_entity entity; ++ struct bfq_sched_data sched_data; ++ ++ void *bfqd; ++ ++ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; ++ struct bfq_queue *async_idle_bfqq; ++ ++ struct bfq_entity *my_entity; ++ ++ int active_entities; ++ ++ struct bfqg_stats stats; ++ struct bfqg_stats dead_stats; /* stats pushed from dead children */ ++}; ++ ++#else ++struct bfq_group { ++ struct bfq_sched_data sched_data; ++ ++ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; ++ struct bfq_queue *async_idle_bfqq; ++}; ++#endif ++ ++static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity); ++ ++static struct bfq_service_tree * ++bfq_entity_service_tree(struct bfq_entity *entity) ++{ ++ struct bfq_sched_data *sched_data = entity->sched_data; ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ unsigned int idx = bfqq ? bfqq->ioprio_class - 1 : ++ BFQ_DEFAULT_GRP_CLASS; ++ ++ BUG_ON(idx >= BFQ_IOPRIO_CLASSES); ++ BUG_ON(sched_data == NULL); ++ ++ return sched_data->service_tree + idx; ++} ++ ++static struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync) ++{ ++ return bic->bfqq[is_sync]; ++} ++ ++static void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, ++ bool is_sync) ++{ ++ bic->bfqq[is_sync] = bfqq; ++} ++ ++static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) ++{ ++ return bic->icq.q->elevator->elevator_data; ++} ++ ++/** ++ * bfq_get_bfqd_locked - get a lock to a bfqd using a RCU protected pointer. ++ * @ptr: a pointer to a bfqd. ++ * @flags: storage for the flags to be saved. ++ * ++ * This function allows bfqg->bfqd to be protected by the ++ * queue lock of the bfqd they reference; the pointer is dereferenced ++ * under RCU, so the storage for bfqd is assured to be safe as long ++ * as the RCU read side critical section does not end. After the ++ * bfqd->queue->queue_lock is taken the pointer is rechecked, to be ++ * sure that no other writer accessed it. If we raced with a writer, ++ * the function returns NULL, with the queue unlocked, otherwise it ++ * returns the dereferenced pointer, with the queue locked. ++ */ ++static struct bfq_data *bfq_get_bfqd_locked(void **ptr, unsigned long *flags) ++{ ++ struct bfq_data *bfqd; ++ ++ rcu_read_lock(); ++ bfqd = rcu_dereference(*(struct bfq_data **)ptr); ++ ++ if (bfqd != NULL) { ++ spin_lock_irqsave(bfqd->queue->queue_lock, *flags); ++ if (ptr == NULL) ++ printk(KERN_CRIT "get_bfqd_locked pointer NULL\n"); ++ else if (*ptr == bfqd) ++ goto out; ++ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); ++ } ++ ++ bfqd = NULL; ++out: ++ rcu_read_unlock(); ++ return bfqd; ++} ++ ++static void bfq_put_bfqd_unlock(struct bfq_data *bfqd, unsigned long *flags) ++{ ++ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); ++} ++ ++static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio); ++static void bfq_put_queue(struct bfq_queue *bfqq); ++static void bfq_dispatch_insert(struct request_queue *q, struct request *rq); ++static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, ++ struct bio *bio, int is_sync, ++ struct bfq_io_cq *bic, gfp_t gfp_mask); ++static void bfq_end_wr_async_queues(struct bfq_data *bfqd, ++ struct bfq_group *bfqg); ++static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); ++static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); ++ ++#endif /* _BFQ_H */ +-- +1.9.1 + diff --git a/0003-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r11-for.patch b/0003-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r11-for.patch new file mode 100644 index 0000000..c2abd3a --- /dev/null +++ b/0003-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r11-for.patch @@ -0,0 +1,1101 @@ +From d3deade9dc903f58c2bf79e316b785f6eaf2441f Mon Sep 17 00:00:00 2001 +From: Mauro Andreolini +Date: Sun, 6 Sep 2015 16:09:05 +0200 +Subject: [PATCH 3/3] block, bfq: add Early Queue Merge (EQM) to BFQ-v7r11 for + 4.5.0 + +A set of processes may happen to perform interleaved reads, i.e.,requests +whose union would give rise to a sequential read pattern. There are two +typical cases: in the first case, processes read fixed-size chunks of +data at a fixed distance from each other, while in the second case processes +may read variable-size chunks at variable distances. The latter case occurs +for example with QEMU, which splits the I/O generated by the guest into +multiple chunks, and lets these chunks be served by a pool of cooperating +processes, iteratively assigning the next chunk of I/O to the first +available process. CFQ uses actual queue merging for the first type of +rocesses, whereas it uses preemption to get a sequential read pattern out +of the read requests performed by the second type of processes. In the end +it uses two different mechanisms to achieve the same goal: boosting the +throughput with interleaved I/O. + +This patch introduces Early Queue Merge (EQM), a unified mechanism to get a +sequential read pattern with both types of processes. The main idea is +checking newly arrived requests against the next request of the active queue +both in case of actual request insert and in case of request merge. By doing +so, both the types of processes can be handled by just merging their queues. +EQM is then simpler and more compact than the pair of mechanisms used in +CFQ. + +Finally, EQM also preserves the typical low-latency properties of BFQ, by +properly restoring the weight-raising state of a queue when it gets back to +a non-merged state. + +Signed-off-by: Mauro Andreolini +Signed-off-by: Arianna Avanzini +Signed-off-by: Paolo Valente +Signed-off-by: Linus Walleij +--- + block/bfq-cgroup.c | 4 + + block/bfq-iosched.c | 687 ++++++++++++++++++++++++++++++++++++++++++++++++++-- + block/bfq.h | 66 +++++ + 3 files changed, 743 insertions(+), 14 deletions(-) + +diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c +index 8610cd6..5ee99ec 100644 +--- a/block/bfq-cgroup.c ++++ b/block/bfq-cgroup.c +@@ -437,6 +437,7 @@ static void bfq_pd_init(struct blkg_policy_data *pd) + */ + bfqg->bfqd = bfqd; + bfqg->active_entities = 0; ++ bfqg->rq_pos_tree = RB_ROOT; + } + + static void bfq_pd_free(struct blkg_policy_data *pd) +@@ -530,6 +531,8 @@ static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd, + return bfqg; + } + ++static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq); ++ + /** + * bfq_bfqq_move - migrate @bfqq to @bfqg. + * @bfqd: queue descriptor. +@@ -577,6 +580,7 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, + bfqg_get(bfqg); + + if (busy) { ++ bfq_pos_tree_add_move(bfqd, bfqq); + if (resume) + bfq_activate_bfqq(bfqd, bfqq); + } +diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c +index f9787a6..d1f648d 100644 +--- a/block/bfq-iosched.c ++++ b/block/bfq-iosched.c +@@ -296,6 +296,72 @@ static struct request *bfq_choose_req(struct bfq_data *bfqd, + } + } + ++static struct bfq_queue * ++bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, ++ sector_t sector, struct rb_node **ret_parent, ++ struct rb_node ***rb_link) ++{ ++ struct rb_node **p, *parent; ++ struct bfq_queue *bfqq = NULL; ++ ++ parent = NULL; ++ p = &root->rb_node; ++ while (*p) { ++ struct rb_node **n; ++ ++ parent = *p; ++ bfqq = rb_entry(parent, struct bfq_queue, pos_node); ++ ++ /* ++ * Sort strictly based on sector. Smallest to the left, ++ * largest to the right. ++ */ ++ if (sector > blk_rq_pos(bfqq->next_rq)) ++ n = &(*p)->rb_right; ++ else if (sector < blk_rq_pos(bfqq->next_rq)) ++ n = &(*p)->rb_left; ++ else ++ break; ++ p = n; ++ bfqq = NULL; ++ } ++ ++ *ret_parent = parent; ++ if (rb_link) ++ *rb_link = p; ++ ++ bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d", ++ (long long unsigned)sector, ++ bfqq ? bfqq->pid : 0); ++ ++ return bfqq; ++} ++ ++static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ struct rb_node **p, *parent; ++ struct bfq_queue *__bfqq; ++ ++ if (bfqq->pos_root) { ++ rb_erase(&bfqq->pos_node, bfqq->pos_root); ++ bfqq->pos_root = NULL; ++ } ++ ++ if (bfq_class_idle(bfqq)) ++ return; ++ if (!bfqq->next_rq) ++ return; ++ ++ bfqq->pos_root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; ++ __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root, ++ blk_rq_pos(bfqq->next_rq), &parent, &p); ++ if (!__bfqq) { ++ rb_link_node(&bfqq->pos_node, parent, p); ++ rb_insert_color(&bfqq->pos_node, bfqq->pos_root); ++ } else ++ bfqq->pos_root = NULL; ++} ++ + /* + * Tell whether there are active queues or groups with differentiated weights. + */ +@@ -528,6 +594,57 @@ static unsigned int bfq_wr_duration(struct bfq_data *bfqd) + return dur; + } + ++static unsigned bfq_bfqq_cooperations(struct bfq_queue *bfqq) ++{ ++ return bfqq->bic ? bfqq->bic->cooperations : 0; ++} ++ ++static void ++bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic) ++{ ++ if (bic->saved_idle_window) ++ bfq_mark_bfqq_idle_window(bfqq); ++ else ++ bfq_clear_bfqq_idle_window(bfqq); ++ if (bic->saved_IO_bound) ++ bfq_mark_bfqq_IO_bound(bfqq); ++ else ++ bfq_clear_bfqq_IO_bound(bfqq); ++ /* Assuming that the flag in_large_burst is already correctly set */ ++ if (bic->wr_time_left && bfqq->bfqd->low_latency && ++ !bfq_bfqq_in_large_burst(bfqq) && ++ bic->cooperations < bfqq->bfqd->bfq_coop_thresh) { ++ /* ++ * Start a weight raising period with the duration given by ++ * the raising_time_left snapshot. ++ */ ++ if (bfq_bfqq_busy(bfqq)) ++ bfqq->bfqd->wr_busy_queues++; ++ bfqq->wr_coeff = bfqq->bfqd->bfq_wr_coeff; ++ bfqq->wr_cur_max_time = bic->wr_time_left; ++ bfqq->last_wr_start_finish = jiffies; ++ bfqq->entity.prio_changed = 1; ++ } ++ /* ++ * Clear wr_time_left to prevent bfq_bfqq_save_state() from ++ * getting confused about the queue's need of a weight-raising ++ * period. ++ */ ++ bic->wr_time_left = 0; ++} ++ ++static int bfqq_process_refs(struct bfq_queue *bfqq) ++{ ++ int process_refs, io_refs; ++ ++ lockdep_assert_held(bfqq->bfqd->queue->queue_lock); ++ ++ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE]; ++ process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st; ++ BUG_ON(process_refs < 0); ++ return process_refs; ++} ++ + /* Empty burst list and add just bfqq (see comments to bfq_handle_burst) */ + static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq) + { +@@ -764,8 +881,14 @@ static void bfq_add_request(struct request *rq) + BUG_ON(!next_rq); + bfqq->next_rq = next_rq; + ++ /* ++ * Adjust priority tree position, if next_rq changes. ++ */ ++ if (prev != bfqq->next_rq) ++ bfq_pos_tree_add_move(bfqd, bfqq); ++ + if (!bfq_bfqq_busy(bfqq)) { +- bool soft_rt, in_burst, ++ bool soft_rt, coop_or_in_burst, + idle_for_long_time = time_is_before_jiffies( + bfqq->budget_timeout + + bfqd->bfq_wr_min_idle_time); +@@ -793,11 +916,12 @@ static void bfq_add_request(struct request *rq) + bfqd->last_ins_in_burst = jiffies; + } + +- in_burst = bfq_bfqq_in_large_burst(bfqq); ++ coop_or_in_burst = bfq_bfqq_in_large_burst(bfqq) || ++ bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh; + soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && +- !in_burst && ++ !coop_or_in_burst && + time_is_before_jiffies(bfqq->soft_rt_next_start); +- interactive = !in_burst && idle_for_long_time; ++ interactive = !coop_or_in_burst && idle_for_long_time; + entity->budget = max_t(unsigned long, bfqq->max_budget, + bfq_serv_to_charge(next_rq, bfqq)); + +@@ -816,6 +940,9 @@ static void bfq_add_request(struct request *rq) + if (!bfqd->low_latency) + goto add_bfqq_busy; + ++ if (bfq_bfqq_just_split(bfqq)) ++ goto set_prio_changed; ++ + /* + * If the queue: + * - is not being boosted, +@@ -840,7 +967,7 @@ static void bfq_add_request(struct request *rq) + } else if (old_wr_coeff > 1) { + if (interactive) + bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); +- else if (in_burst || ++ else if (coop_or_in_burst || + (bfqq->wr_cur_max_time == + bfqd->bfq_wr_rt_max_time && + !soft_rt)) { +@@ -905,6 +1032,7 @@ static void bfq_add_request(struct request *rq) + bfqd->bfq_wr_rt_max_time; + } + } ++set_prio_changed: + if (old_wr_coeff != bfqq->wr_coeff) + entity->prio_changed = 1; + add_bfqq_busy: +@@ -1047,6 +1175,15 @@ static void bfq_merged_request(struct request_queue *q, struct request *req, + bfqd->last_position); + BUG_ON(!next_rq); + bfqq->next_rq = next_rq; ++ /* ++ * If next_rq changes, update both the queue's budget to ++ * fit the new request and the queue's position in its ++ * rq_pos_tree. ++ */ ++ if (prev != bfqq->next_rq) { ++ bfq_updated_next_req(bfqd, bfqq); ++ bfq_pos_tree_add_move(bfqd, bfqq); ++ } + } + } + +@@ -1129,11 +1266,346 @@ static void bfq_end_wr(struct bfq_data *bfqd) + spin_unlock_irq(bfqd->queue->queue_lock); + } + ++static sector_t bfq_io_struct_pos(void *io_struct, bool request) ++{ ++ if (request) ++ return blk_rq_pos(io_struct); ++ else ++ return ((struct bio *)io_struct)->bi_iter.bi_sector; ++} ++ ++static int bfq_rq_close_to_sector(void *io_struct, bool request, ++ sector_t sector) ++{ ++ return abs(bfq_io_struct_pos(io_struct, request) - sector) <= ++ BFQQ_SEEK_THR; ++} ++ ++static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ sector_t sector) ++{ ++ struct rb_root *root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; ++ struct rb_node *parent, *node; ++ struct bfq_queue *__bfqq; ++ ++ if (RB_EMPTY_ROOT(root)) ++ return NULL; ++ ++ /* ++ * First, if we find a request starting at the end of the last ++ * request, choose it. ++ */ ++ __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL); ++ if (__bfqq) ++ return __bfqq; ++ ++ /* ++ * If the exact sector wasn't found, the parent of the NULL leaf ++ * will contain the closest sector (rq_pos_tree sorted by ++ * next_request position). ++ */ ++ __bfqq = rb_entry(parent, struct bfq_queue, pos_node); ++ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) ++ return __bfqq; ++ ++ if (blk_rq_pos(__bfqq->next_rq) < sector) ++ node = rb_next(&__bfqq->pos_node); ++ else ++ node = rb_prev(&__bfqq->pos_node); ++ if (!node) ++ return NULL; ++ ++ __bfqq = rb_entry(node, struct bfq_queue, pos_node); ++ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) ++ return __bfqq; ++ ++ return NULL; ++} ++ ++static struct bfq_queue *bfq_find_close_cooperator(struct bfq_data *bfqd, ++ struct bfq_queue *cur_bfqq, ++ sector_t sector) ++{ ++ struct bfq_queue *bfqq; ++ ++ /* ++ * We shall notice if some of the queues are cooperating, ++ * e.g., working closely on the same area of the device. In ++ * that case, we can group them together and: 1) don't waste ++ * time idling, and 2) serve the union of their requests in ++ * the best possible order for throughput. ++ */ ++ bfqq = bfqq_find_close(bfqd, cur_bfqq, sector); ++ if (!bfqq || bfqq == cur_bfqq) ++ return NULL; ++ ++ return bfqq; ++} ++ ++static struct bfq_queue * ++bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) ++{ ++ int process_refs, new_process_refs; ++ struct bfq_queue *__bfqq; ++ ++ /* ++ * If there are no process references on the new_bfqq, then it is ++ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain ++ * may have dropped their last reference (not just their last process ++ * reference). ++ */ ++ if (!bfqq_process_refs(new_bfqq)) ++ return NULL; ++ ++ /* Avoid a circular list and skip interim queue merges. */ ++ while ((__bfqq = new_bfqq->new_bfqq)) { ++ if (__bfqq == bfqq) ++ return NULL; ++ new_bfqq = __bfqq; ++ } ++ ++ process_refs = bfqq_process_refs(bfqq); ++ new_process_refs = bfqq_process_refs(new_bfqq); ++ /* ++ * If the process for the bfqq has gone away, there is no ++ * sense in merging the queues. ++ */ ++ if (process_refs == 0 || new_process_refs == 0) ++ return NULL; ++ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d", ++ new_bfqq->pid); ++ ++ /* ++ * Merging is just a redirection: the requests of the process ++ * owning one of the two queues are redirected to the other queue. ++ * The latter queue, in its turn, is set as shared if this is the ++ * first time that the requests of some process are redirected to ++ * it. ++ * ++ * We redirect bfqq to new_bfqq and not the opposite, because we ++ * are in the context of the process owning bfqq, hence we have ++ * the io_cq of this process. So we can immediately configure this ++ * io_cq to redirect the requests of the process to new_bfqq. ++ * ++ * NOTE, even if new_bfqq coincides with the in-service queue, the ++ * io_cq of new_bfqq is not available, because, if the in-service ++ * queue is shared, bfqd->in_service_bic may not point to the ++ * io_cq of the in-service queue. ++ * Redirecting the requests of the process owning bfqq to the ++ * currently in-service queue is in any case the best option, as ++ * we feed the in-service queue with new requests close to the ++ * last request served and, by doing so, hopefully increase the ++ * throughput. ++ */ ++ bfqq->new_bfqq = new_bfqq; ++ atomic_add(process_refs, &new_bfqq->ref); ++ return new_bfqq; ++} ++ ++static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, ++ struct bfq_queue *new_bfqq) ++{ ++ if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) || ++ (bfqq->ioprio_class != new_bfqq->ioprio_class)) ++ return false; ++ ++ /* ++ * If either of the queues has already been detected as seeky, ++ * then merging it with the other queue is unlikely to lead to ++ * sequential I/O. ++ */ ++ if (BFQQ_SEEKY(bfqq) || BFQQ_SEEKY(new_bfqq)) ++ return false; ++ ++ /* ++ * Interleaved I/O is known to be done by (some) applications ++ * only for reads, so it does not make sense to merge async ++ * queues. ++ */ ++ if (!bfq_bfqq_sync(bfqq) || !bfq_bfqq_sync(new_bfqq)) ++ return false; ++ ++ return true; ++} ++ ++/* ++ * Attempt to schedule a merge of bfqq with the currently in-service queue ++ * or with a close queue among the scheduled queues. ++ * Return NULL if no merge was scheduled, a pointer to the shared bfq_queue ++ * structure otherwise. ++ * ++ * The OOM queue is not allowed to participate to cooperation: in fact, since ++ * the requests temporarily redirected to the OOM queue could be redirected ++ * again to dedicated queues at any time, the state needed to correctly ++ * handle merging with the OOM queue would be quite complex and expensive ++ * to maintain. Besides, in such a critical condition as an out of memory, ++ * the benefits of queue merging may be little relevant, or even negligible. ++ */ ++static struct bfq_queue * ++bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ void *io_struct, bool request) ++{ ++ struct bfq_queue *in_service_bfqq, *new_bfqq; ++ ++ if (bfqq->new_bfqq) ++ return bfqq->new_bfqq; ++ if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq)) ++ return NULL; ++ /* If device has only one backlogged bfq_queue, don't search. */ ++ if (bfqd->busy_queues == 1) ++ return NULL; ++ ++ in_service_bfqq = bfqd->in_service_queue; ++ ++ if (!in_service_bfqq || in_service_bfqq == bfqq || ++ !bfqd->in_service_bic || ++ unlikely(in_service_bfqq == &bfqd->oom_bfqq)) ++ goto check_scheduled; ++ ++ if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) && ++ bfqq->entity.parent == in_service_bfqq->entity.parent && ++ bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) { ++ new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq); ++ if (new_bfqq) ++ return new_bfqq; ++ } ++ /* ++ * Check whether there is a cooperator among currently scheduled ++ * queues. The only thing we need is that the bio/request is not ++ * NULL, as we need it to establish whether a cooperator exists. ++ */ ++check_scheduled: ++ new_bfqq = bfq_find_close_cooperator(bfqd, bfqq, ++ bfq_io_struct_pos(io_struct, request)); ++ ++ BUG_ON(new_bfqq && bfqq->entity.parent != new_bfqq->entity.parent); ++ ++ if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq) && ++ bfq_may_be_close_cooperator(bfqq, new_bfqq)) ++ return bfq_setup_merge(bfqq, new_bfqq); ++ ++ return NULL; ++} ++ ++static void bfq_bfqq_save_state(struct bfq_queue *bfqq) ++{ ++ /* ++ * If !bfqq->bic, the queue is already shared or its requests ++ * have already been redirected to a shared queue; both idle window ++ * and weight raising state have already been saved. Do nothing. ++ */ ++ if (!bfqq->bic) ++ return; ++ if (bfqq->bic->wr_time_left) ++ /* ++ * This is the queue of a just-started process, and would ++ * deserve weight raising: we set wr_time_left to the full ++ * weight-raising duration to trigger weight-raising when ++ * and if the queue is split and the first request of the ++ * queue is enqueued. ++ */ ++ bfqq->bic->wr_time_left = bfq_wr_duration(bfqq->bfqd); ++ else if (bfqq->wr_coeff > 1) { ++ unsigned long wr_duration = ++ jiffies - bfqq->last_wr_start_finish; ++ /* ++ * It may happen that a queue's weight raising period lasts ++ * longer than its wr_cur_max_time, as weight raising is ++ * handled only when a request is enqueued or dispatched (it ++ * does not use any timer). If the weight raising period is ++ * about to end, don't save it. ++ */ ++ if (bfqq->wr_cur_max_time <= wr_duration) ++ bfqq->bic->wr_time_left = 0; ++ else ++ bfqq->bic->wr_time_left = ++ bfqq->wr_cur_max_time - wr_duration; ++ /* ++ * The bfq_queue is becoming shared or the requests of the ++ * process owning the queue are being redirected to a shared ++ * queue. Stop the weight raising period of the queue, as in ++ * both cases it should not be owned by an interactive or ++ * soft real-time application. ++ */ ++ bfq_bfqq_end_wr(bfqq); ++ } else ++ bfqq->bic->wr_time_left = 0; ++ bfqq->bic->saved_idle_window = bfq_bfqq_idle_window(bfqq); ++ bfqq->bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); ++ bfqq->bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); ++ bfqq->bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); ++ bfqq->bic->cooperations++; ++ bfqq->bic->failed_cooperations = 0; ++} ++ ++static void bfq_get_bic_reference(struct bfq_queue *bfqq) ++{ ++ /* ++ * If bfqq->bic has a non-NULL value, the bic to which it belongs ++ * is about to begin using a shared bfq_queue. ++ */ ++ if (bfqq->bic) ++ atomic_long_inc(&bfqq->bic->icq.ioc->refcount); ++} ++ ++static void ++bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, ++ struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) ++{ ++ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu", ++ (long unsigned)new_bfqq->pid); ++ /* Save weight raising and idle window of the merged queues */ ++ bfq_bfqq_save_state(bfqq); ++ bfq_bfqq_save_state(new_bfqq); ++ if (bfq_bfqq_IO_bound(bfqq)) ++ bfq_mark_bfqq_IO_bound(new_bfqq); ++ bfq_clear_bfqq_IO_bound(bfqq); ++ /* ++ * Grab a reference to the bic, to prevent it from being destroyed ++ * before being possibly touched by a bfq_split_bfqq(). ++ */ ++ bfq_get_bic_reference(bfqq); ++ bfq_get_bic_reference(new_bfqq); ++ /* ++ * Merge queues (that is, let bic redirect its requests to new_bfqq) ++ */ ++ bic_set_bfqq(bic, new_bfqq, 1); ++ bfq_mark_bfqq_coop(new_bfqq); ++ /* ++ * new_bfqq now belongs to at least two bics (it is a shared queue): ++ * set new_bfqq->bic to NULL. bfqq either: ++ * - does not belong to any bic any more, and hence bfqq->bic must ++ * be set to NULL, or ++ * - is a queue whose owning bics have already been redirected to a ++ * different queue, hence the queue is destined to not belong to ++ * any bic soon and bfqq->bic is already NULL (therefore the next ++ * assignment causes no harm). ++ */ ++ new_bfqq->bic = NULL; ++ bfqq->bic = NULL; ++ bfq_put_queue(bfqq); ++} ++ ++static void bfq_bfqq_increase_failed_cooperations(struct bfq_queue *bfqq) ++{ ++ struct bfq_io_cq *bic = bfqq->bic; ++ struct bfq_data *bfqd = bfqq->bfqd; ++ ++ if (bic && bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh) { ++ bic->failed_cooperations++; ++ if (bic->failed_cooperations >= bfqd->bfq_failed_cooperations) ++ bic->cooperations = 0; ++ } ++} ++ + static int bfq_allow_merge(struct request_queue *q, struct request *rq, + struct bio *bio) + { + struct bfq_data *bfqd = q->elevator->elevator_data; + struct bfq_io_cq *bic; ++ struct bfq_queue *bfqq, *new_bfqq; + + /* + * Disallow merge of a sync bio into an async request. +@@ -1150,7 +1622,26 @@ static int bfq_allow_merge(struct request_queue *q, struct request *rq, + if (!bic) + return 0; + +- return bic_to_bfqq(bic, bfq_bio_sync(bio)) == RQ_BFQQ(rq); ++ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio)); ++ /* ++ * We take advantage of this function to perform an early merge ++ * of the queues of possible cooperating processes. ++ */ ++ if (bfqq) { ++ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false); ++ if (new_bfqq) { ++ bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq); ++ /* ++ * If we get here, the bio will be queued in the ++ * shared queue, i.e., new_bfqq, so use new_bfqq ++ * to decide whether bio and rq can be merged. ++ */ ++ bfqq = new_bfqq; ++ } else ++ bfq_bfqq_increase_failed_cooperations(bfqq); ++ } ++ ++ return bfqq == RQ_BFQQ(rq); + } + + static void __bfq_set_in_service_queue(struct bfq_data *bfqd, +@@ -1349,6 +1840,15 @@ static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) + + __bfq_bfqd_reset_in_service(bfqd); + ++ /* ++ * If this bfqq is shared between multiple processes, check ++ * to make sure that those processes are still issuing I/Os ++ * within the mean seek distance. If not, it may be time to ++ * break the queues apart again. ++ */ ++ if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq)) ++ bfq_mark_bfqq_split_coop(bfqq); ++ + if (RB_EMPTY_ROOT(&bfqq->sort_list)) { + /* + * Overloading budget_timeout field to store the time +@@ -1357,8 +1857,13 @@ static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) + */ + bfqq->budget_timeout = jiffies; + bfq_del_bfqq_busy(bfqd, bfqq, 1); +- } else ++ } else { + bfq_activate_bfqq(bfqd, bfqq); ++ /* ++ * Resort priority tree of potential close cooperators. ++ */ ++ bfq_pos_tree_add_move(bfqd, bfqq); ++ } + } + + /** +@@ -2242,10 +2747,12 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) + /* + * If the queue was activated in a burst, or + * too much time has elapsed from the beginning +- * of this weight-raising period, then end weight +- * raising. ++ * of this weight-raising period, or the queue has ++ * exceeded the acceptable number of cooperations, ++ * then end weight raising. + */ + if (bfq_bfqq_in_large_burst(bfqq) || ++ bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh || + time_is_before_jiffies(bfqq->last_wr_start_finish + + bfqq->wr_cur_max_time)) { + bfqq->last_wr_start_finish = jiffies; +@@ -2474,6 +2981,25 @@ static void bfq_put_queue(struct bfq_queue *bfqq) + #endif + } + ++static void bfq_put_cooperator(struct bfq_queue *bfqq) ++{ ++ struct bfq_queue *__bfqq, *next; ++ ++ /* ++ * If this queue was scheduled to merge with another queue, be ++ * sure to drop the reference taken on that queue (and others in ++ * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs. ++ */ ++ __bfqq = bfqq->new_bfqq; ++ while (__bfqq) { ++ if (__bfqq == bfqq) ++ break; ++ next = __bfqq->new_bfqq; ++ bfq_put_queue(__bfqq); ++ __bfqq = next; ++ } ++} ++ + static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) + { + if (bfqq == bfqd->in_service_queue) { +@@ -2484,6 +3010,8 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) + bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, + atomic_read(&bfqq->ref)); + ++ bfq_put_cooperator(bfqq); ++ + bfq_put_queue(bfqq); + } + +@@ -2492,6 +3020,25 @@ static void bfq_init_icq(struct io_cq *icq) + struct bfq_io_cq *bic = icq_to_bic(icq); + + bic->ttime.last_end_request = jiffies; ++ /* ++ * A newly created bic indicates that the process has just ++ * started doing I/O, and is probably mapping into memory its ++ * executable and libraries: it definitely needs weight raising. ++ * There is however the possibility that the process performs, ++ * for a while, I/O close to some other process. EQM intercepts ++ * this behavior and may merge the queue corresponding to the ++ * process with some other queue, BEFORE the weight of the queue ++ * is raised. Merged queues are not weight-raised (they are assumed ++ * to belong to processes that benefit only from high throughput). ++ * If the merge is basically the consequence of an accident, then ++ * the queue will be split soon and will get back its old weight. ++ * It is then important to write down somewhere that this queue ++ * does need weight raising, even if it did not make it to get its ++ * weight raised before being merged. To this purpose, we overload ++ * the field raising_time_left and assign 1 to it, to mark the queue ++ * as needing weight raising. ++ */ ++ bic->wr_time_left = 1; + } + + static void bfq_exit_icq(struct io_cq *icq) +@@ -2505,6 +3052,13 @@ static void bfq_exit_icq(struct io_cq *icq) + } + + if (bic->bfqq[BLK_RW_SYNC]) { ++ /* ++ * If the bic is using a shared queue, put the reference ++ * taken on the io_context when the bic started using a ++ * shared bfq_queue. ++ */ ++ if (bfq_bfqq_coop(bic->bfqq[BLK_RW_SYNC])) ++ put_io_context(icq->ioc); + bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]); + bic->bfqq[BLK_RW_SYNC] = NULL; + } +@@ -2809,6 +3363,10 @@ static void bfq_update_idle_window(struct bfq_data *bfqd, + if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq)) + return; + ++ /* Idle window just restored, statistics are meaningless. */ ++ if (bfq_bfqq_just_split(bfqq)) ++ return; ++ + enable_idle = bfq_bfqq_idle_window(bfqq); + + if (atomic_read(&bic->icq.ioc->active_ref) == 0 || +@@ -2856,6 +3414,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, + if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 || + !BFQQ_SEEKY(bfqq)) + bfq_update_idle_window(bfqd, bfqq, bic); ++ bfq_clear_bfqq_just_split(bfqq); + + bfq_log_bfqq(bfqd, bfqq, + "rq_enqueued: idle_window=%d (seeky %d, mean %llu)", +@@ -2920,12 +3479,47 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, + static void bfq_insert_request(struct request_queue *q, struct request *rq) + { + struct bfq_data *bfqd = q->elevator->elevator_data; +- struct bfq_queue *bfqq = RQ_BFQQ(rq); ++ struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq; + + assert_spin_locked(bfqd->queue->queue_lock); + ++ /* ++ * An unplug may trigger a requeue of a request from the device ++ * driver: make sure we are in process context while trying to ++ * merge two bfq_queues. ++ */ ++ if (!in_interrupt()) { ++ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true); ++ if (new_bfqq) { ++ if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq) ++ new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1); ++ /* ++ * Release the request's reference to the old bfqq ++ * and make sure one is taken to the shared queue. ++ */ ++ new_bfqq->allocated[rq_data_dir(rq)]++; ++ bfqq->allocated[rq_data_dir(rq)]--; ++ atomic_inc(&new_bfqq->ref); ++ bfq_put_queue(bfqq); ++ if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) ++ bfq_merge_bfqqs(bfqd, RQ_BIC(rq), ++ bfqq, new_bfqq); ++ rq->elv.priv[1] = new_bfqq; ++ bfqq = new_bfqq; ++ } else ++ bfq_bfqq_increase_failed_cooperations(bfqq); ++ } ++ + bfq_add_request(rq); + ++ /* ++ * Here a newly-created bfq_queue has already started a weight-raising ++ * period: clear raising_time_left to prevent bfq_bfqq_save_state() ++ * from assigning it a full weight-raising period. See the detailed ++ * comments about this field in bfq_init_icq(). ++ */ ++ if (bfqq->bic) ++ bfqq->bic->wr_time_left = 0; + rq->fifo_time = jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; + list_add_tail(&rq->queuelist, &bfqq->fifo); + +@@ -3094,6 +3688,32 @@ static void bfq_put_request(struct request *rq) + } + + /* ++ * Returns NULL if a new bfqq should be allocated, or the old bfqq if this ++ * was the last process referring to said bfqq. ++ */ ++static struct bfq_queue * ++bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) ++{ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue"); ++ ++ put_io_context(bic->icq.ioc); ++ ++ if (bfqq_process_refs(bfqq) == 1) { ++ bfqq->pid = current->pid; ++ bfq_clear_bfqq_coop(bfqq); ++ bfq_clear_bfqq_split_coop(bfqq); ++ return bfqq; ++ } ++ ++ bic_set_bfqq(bic, NULL, 1); ++ ++ bfq_put_cooperator(bfqq); ++ ++ bfq_put_queue(bfqq); ++ return NULL; ++} ++ ++/* + * Allocate bfq data structures associated with this request. + */ + static int bfq_set_request(struct request_queue *q, struct request *rq, +@@ -3105,6 +3725,7 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, + const int is_sync = rq_is_sync(rq); + struct bfq_queue *bfqq; + unsigned long flags; ++ bool split = false; + + might_sleep_if(gfpflags_allow_blocking(gfp_mask)); + +@@ -3117,15 +3738,30 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, + + bfq_bic_update_cgroup(bic, bio); + ++new_queue: + bfqq = bic_to_bfqq(bic, is_sync); + if (!bfqq || bfqq == &bfqd->oom_bfqq) { + bfqq = bfq_get_queue(bfqd, bio, is_sync, bic, gfp_mask); + bic_set_bfqq(bic, bfqq, is_sync); +- if (is_sync) { +- if (bfqd->large_burst) ++ if (split && is_sync) { ++ if ((bic->was_in_burst_list && bfqd->large_burst) || ++ bic->saved_in_large_burst) + bfq_mark_bfqq_in_large_burst(bfqq); +- else +- bfq_clear_bfqq_in_large_burst(bfqq); ++ else { ++ bfq_clear_bfqq_in_large_burst(bfqq); ++ if (bic->was_in_burst_list) ++ hlist_add_head(&bfqq->burst_list_node, ++ &bfqd->burst_list); ++ } ++ } ++ } else { ++ /* If the queue was seeky for too long, break it apart. */ ++ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { ++ bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq"); ++ bfqq = bfq_split_bfqq(bic, bfqq); ++ split = true; ++ if (!bfqq) ++ goto new_queue; + } + } + +@@ -3137,6 +3773,26 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, + rq->elv.priv[0] = bic; + rq->elv.priv[1] = bfqq; + ++ /* ++ * If a bfq_queue has only one process reference, it is owned ++ * by only one bfq_io_cq: we can set the bic field of the ++ * bfq_queue to the address of that structure. Also, if the ++ * queue has just been split, mark a flag so that the ++ * information is available to the other scheduler hooks. ++ */ ++ if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) { ++ bfqq->bic = bic; ++ if (split) { ++ bfq_mark_bfqq_just_split(bfqq); ++ /* ++ * If the queue has just been split from a shared ++ * queue, restore the idle window and the possible ++ * weight raising period. ++ */ ++ bfq_bfqq_resume_state(bfqq, bic); ++ } ++ } ++ + spin_unlock_irqrestore(q->queue_lock, flags); + + return 0; +@@ -3290,6 +3946,7 @@ static void bfq_init_root_group(struct bfq_group *root_group, + root_group->my_entity = NULL; + root_group->bfqd = bfqd; + #endif ++ root_group->rq_pos_tree = RB_ROOT; + for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) + root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; + } +@@ -3370,6 +4027,8 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) + bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async; + bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync; + ++ bfqd->bfq_coop_thresh = 2; ++ bfqd->bfq_failed_cooperations = 7000; + bfqd->bfq_requests_within_timer = 120; + + bfqd->bfq_large_burst_thresh = 11; +diff --git a/block/bfq.h b/block/bfq.h +index 3bb7df2..32dfcee 100644 +--- a/block/bfq.h ++++ b/block/bfq.h +@@ -183,6 +183,8 @@ struct bfq_group; + * ioprio_class value. + * @new_bfqq: shared bfq_queue if queue is cooperating with + * one or more other queues. ++ * @pos_node: request-position tree member (see bfq_group's @rq_pos_tree). ++ * @pos_root: request-position tree root (see bfq_group's @rq_pos_tree). + * @sort_list: sorted list of pending requests. + * @next_rq: if fifo isn't expired, next request to serve. + * @queued: nr of requests queued in @sort_list. +@@ -304,6 +306,26 @@ struct bfq_ttime { + * @ttime: associated @bfq_ttime struct + * @ioprio: per (request_queue, blkcg) ioprio. + * @blkcg_id: id of the blkcg the related io_cq belongs to. ++ * @wr_time_left: snapshot of the time left before weight raising ends ++ * for the sync queue associated to this process; this ++ * snapshot is taken to remember this value while the weight ++ * raising is suspended because the queue is merged with a ++ * shared queue, and is used to set @raising_cur_max_time ++ * when the queue is split from the shared queue and its ++ * weight is raised again ++ * @saved_idle_window: same purpose as the previous field for the idle ++ * window ++ * @saved_IO_bound: same purpose as the previous two fields for the I/O ++ * bound classification of a queue ++ * @saved_in_large_burst: same purpose as the previous fields for the ++ * value of the field keeping the queue's belonging ++ * to a large burst ++ * @was_in_burst_list: true if the queue belonged to a burst list ++ * before its merge with another cooperating queue ++ * @cooperations: counter of consecutive successful queue merges underwent ++ * by any of the process' @bfq_queues ++ * @failed_cooperations: counter of consecutive failed queue merges of any ++ * of the process' @bfq_queues + */ + struct bfq_io_cq { + struct io_cq icq; /* must be the first member */ +@@ -314,6 +336,16 @@ struct bfq_io_cq { + #ifdef CONFIG_BFQ_GROUP_IOSCHED + uint64_t blkcg_id; /* the current blkcg ID */ + #endif ++ ++ unsigned int wr_time_left; ++ bool saved_idle_window; ++ bool saved_IO_bound; ++ ++ bool saved_in_large_burst; ++ bool was_in_burst_list; ++ ++ unsigned int cooperations; ++ unsigned int failed_cooperations; + }; + + enum bfq_device_speed { +@@ -557,6 +589,9 @@ enum bfqq_state_flags { + * may need softrt-next-start + * update + */ ++ BFQ_BFQQ_FLAG_coop, /* bfqq is shared */ ++ BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be split */ ++ BFQ_BFQQ_FLAG_just_split, /* queue has just been split */ + }; + + #define BFQ_BFQQ_FNS(name) \ +@@ -583,6 +618,9 @@ BFQ_BFQQ_FNS(budget_new); + BFQ_BFQQ_FNS(IO_bound); + BFQ_BFQQ_FNS(in_large_burst); + BFQ_BFQQ_FNS(constantly_seeky); ++BFQ_BFQQ_FNS(coop); ++BFQ_BFQQ_FNS(split_coop); ++BFQ_BFQQ_FNS(just_split); + BFQ_BFQQ_FNS(softrt_update); + #undef BFQ_BFQQ_FNS + +@@ -675,6 +713,9 @@ struct bfq_group_data { + * are groups with more than one active @bfq_entity + * (see the comments to the function + * bfq_bfqq_must_not_expire()). ++ * @rq_pos_tree: rbtree sorted by next_request position, used when ++ * determining if two or more queues have interleaving ++ * requests (see bfq_find_close_cooperator()). + * + * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup + * there is a set of bfq_groups, each one collecting the lower-level +@@ -701,6 +742,8 @@ struct bfq_group { + + int active_entities; + ++ struct rb_root rq_pos_tree; ++ + struct bfqg_stats stats; + struct bfqg_stats dead_stats; /* stats pushed from dead children */ + }; +@@ -711,6 +754,8 @@ struct bfq_group { + + struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; + struct bfq_queue *async_idle_bfqq; ++ ++ struct rb_root rq_pos_tree; + }; + #endif + +@@ -787,6 +832,27 @@ static void bfq_put_bfqd_unlock(struct bfq_data *bfqd, unsigned long *flags) + spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); + } + ++#ifdef CONFIG_BFQ_GROUP_IOSCHED ++ ++static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) ++{ ++ struct bfq_entity *group_entity = bfqq->entity.parent; ++ ++ if (!group_entity) ++ group_entity = &bfqq->bfqd->root_group->entity; ++ ++ return container_of(group_entity, struct bfq_group, entity); ++} ++ ++#else ++ ++static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) ++{ ++ return bfqq->bfqd->root_group; ++} ++ ++#endif ++ + static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio); + static void bfq_put_queue(struct bfq_queue *bfqq); + static void bfq_dispatch_insert(struct request_queue *q, struct request *rq); +-- +1.9.1 + diff --git a/ata-prefer-ata-drivers-over-ide-drivers-when-both-are-built.patch b/ata-prefer-ata-drivers-over-ide-drivers-when-both-are-built.patch new file mode 100644 index 0000000..9cbf1b0 --- /dev/null +++ b/ata-prefer-ata-drivers-over-ide-drivers-when-both-are-built.patch @@ -0,0 +1,34 @@ +>From 9f04e51293b130474504216a477bb2a73cbf59e1 Mon Sep 17 00:00:00 2001 +From: Anssi Hannula +Date: Thu, 22 Mar 2012 22:29:11 +0200 +Subject: [PATCH] ata: prefer ata drivers over ide drivers when both are built + +Currently the old IDE drivers are preferred over ATA drivers when both +are built, since ide/ is listed first in drivers/Makefile and therefore +the IDE drivers end up before ATA drivers in modules.order which is used +by depmod/modprobe for module ordering. + +Change it so that ATA drivers are preferred over IDE driver by moving +the ide/ entry under ata/ in drivers/Makefile. + +Signed-off-by: Anssi Hannula +--- + drivers/Makefile | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/Makefile b/drivers/Makefile +index 795d0ca..ff3c239 100644 +--- a/drivers/Makefile ++++ b/drivers/Makefile +@@ -69,10 +69,10 @@ obj-$(CONFIG_LIBNVDIMM) += nvdimm/ + obj-$(CONFIG_DMA_SHARED_BUFFER) += dma-buf/ + obj-$(CONFIG_NUBUS) += nubus/ + obj-y += macintosh/ +-obj-$(CONFIG_IDE) += ide/ + obj-$(CONFIG_SCSI) += scsi/ + obj-y += nvme/ + obj-$(CONFIG_ATA) += ata/ ++obj-$(CONFIG_IDE) += ide/ + obj-$(CONFIG_TARGET_CORE) += target/ + obj-$(CONFIG_MTD) += mtd/ + obj-$(CONFIG_SPI) += spi/ diff --git a/audit-make-it-less-verbose.patch b/audit-make-it-less-verbose.patch new file mode 100644 index 0000000..39d4c5d --- /dev/null +++ b/audit-make-it-less-verbose.patch @@ -0,0 +1,22 @@ +It seems, if audit itself is not installed and therefore nothing listens +to the messages from the kernel's audit subsystem, the latter spams the +kernel log with such messages. + +Let us make them debug-level and thus invisible by default. + +http://bugs.rosalinux.ru/show_bug.cgi?id=6235 +http://bugs.rosalinux.ru/show_bug.cgi?id=6459 + +diff --git a/kernel/audit.c b/kernel/audit.c +index 1c13e42..56270ce 100644 +--- a/kernel/audit.c ++++ b/kernel/audit.c +@@ -396,7 +396,7 @@ static void audit_printk_skb(struct sk_buff *skb) + + if (nlh->nlmsg_type != AUDIT_EOE) { + if (printk_ratelimit()) +- pr_notice("type=%d %s\n", nlh->nlmsg_type, data); ++ pr_debug("type=%d %s\n", nlh->nlmsg_type, data); + else + audit_log_lost("printk limit exceeded"); + } diff --git a/block-floppy-disable-pnp-modalias.patch b/block-floppy-disable-pnp-modalias.patch new file mode 100644 index 0000000..c994472 --- /dev/null +++ b/block-floppy-disable-pnp-modalias.patch @@ -0,0 +1,21 @@ + +Disable floppy autoloading as it makes systems without real hw, +but that has the pnp headers hang. + +See https://bugs.mageia.org/show_bug.cgi?id=4696 + +Signed-off-by: Thomas Backlund + +diff -Nurp linux-3.3.3-rc1/drivers/block/floppy.c.orig linux-3.3.3-rc1/drivers/block/floppy.c +--- linux-3.3.3-rc1/drivers/block/floppy.c.orig 2012-03-19 01:15:34.000000000 +0200 ++++ linux-3.3.3-rc1/drivers/block/floppy.c 2012-04-22 02:58:10.238498303 +0300 +@@ -4621,8 +4621,7 @@ static const struct pnp_device_id floppy + {"PNP0700", 0}, + {} + }; +- +-MODULE_DEVICE_TABLE(pnp, floppy_pnpids); ++/* MODULE_DEVICE_TABLE(pnp, floppy_pnpids); */ + + #else + diff --git a/drm-cirrus-Use-16bpp-as-default.patch b/drm-cirrus-Use-16bpp-as-default.patch new file mode 100644 index 0000000..a4e5eb5 --- /dev/null +++ b/drm-cirrus-Use-16bpp-as-default.patch @@ -0,0 +1,49 @@ +From: Takashi Iwai +Date: Fri, 25 Jan 2013 17:08:03 +0100 +Subject: [PATCH] drm/cirrus: Use 16bpp as default +Patch-mainline: Submitted +References: bnc#799216 + +We've got a bug report that GNOME on QEMU shows wrong colors. +It turned out that it's because Cairo doesn't support 24bpp well. +This hasn't been an issue until now because we (at least SUSE and +Fedora) have a patch to use 16bpp for QEMU in Xorg cirrus UMS driver. + +Since cirrus KMS driver is mainly targeted for the use on QEMU/KVM, we +should choose 16bpp as default, too. + +Also, it's not convenient to set the default bpp in multiple places. +cirrus_fbdev_init() should check the original preferred depth set in +cirrus_modeset_init(). + +Bugzilla: https://bugzilla.novell.com/show_bug.cgi?id=799216 + +Signed-off-by: Takashi Iwai + +--- + drivers/gpu/drm/cirrus/cirrus_fbdev.c | 2 +- + drivers/gpu/drm/cirrus/cirrus_mode.c | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) + +--- a/drivers/gpu/drm/cirrus/cirrus_fbdev.c ++++ b/drivers/gpu/drm/cirrus/cirrus_fbdev.c +@@ -273,7 +273,7 @@ int cirrus_fbdev_init(struct cirrus_devi + { + struct cirrus_fbdev *gfbdev; + int ret; +- int bpp_sel = 24; ++ int bpp_sel = cdev->dev->mode_config.preferred_depth; + + /*bpp_sel = 8;*/ + gfbdev = kzalloc(sizeof(struct cirrus_fbdev), GFP_KERNEL); +--- a/drivers/gpu/drm/cirrus/cirrus_mode.c ++++ b/drivers/gpu/drm/cirrus/cirrus_mode.c +@@ -588,7 +588,7 @@ int cirrus_modeset_init(struct cirrus_de + cdev->dev->mode_config.max_height = CIRRUS_MAX_FB_HEIGHT; + + cdev->dev->mode_config.fb_base = cdev->mc.vram_base; +- cdev->dev->mode_config.preferred_depth = 24; ++ cdev->dev->mode_config.preferred_depth = 16; + /* don't prefer a shadow on virt GPU */ + cdev->dev->mode_config.prefer_shadow = 0; + diff --git a/fs-aufs4.patch b/fs-aufs4.patch new file mode 100644 index 0000000..d153623 --- /dev/null +++ b/fs-aufs4.patch @@ -0,0 +1,35794 @@ +diff --git a/Documentation/ABI/testing/debugfs-aufs b/Documentation/ABI/testing/debugfs-aufs +new file mode 100644 +index 0000000..99642d1 +--- /dev/null ++++ b/Documentation/ABI/testing/debugfs-aufs +@@ -0,0 +1,50 @@ ++What: /debug/aufs/si_/ ++Date: March 2009 ++Contact: J. R. Okajima ++Description: ++ Under /debug/aufs, a directory named si_ is created ++ per aufs mount, where is a unique id generated ++ internally. ++ ++What: /debug/aufs/si_/plink ++Date: Apr 2013 ++Contact: J. R. Okajima ++Description: ++ It has three lines and shows the information about the ++ pseudo-link. The first line is a single number ++ representing a number of buckets. The second line is a ++ number of pseudo-links per buckets (separated by a ++ blank). The last line is a single number representing a ++ total number of psedo-links. ++ When the aufs mount option 'noplink' is specified, it ++ will show "1\n0\n0\n". ++ ++What: /debug/aufs/si_/xib ++Date: March 2009 ++Contact: J. R. Okajima ++Description: ++ It shows the consumed blocks by xib (External Inode Number ++ Bitmap), its block size and file size. ++ When the aufs mount option 'noxino' is specified, it ++ will be empty. About XINO files, see the aufs manual. ++ ++What: /debug/aufs/si_/xino0, xino1 ... xinoN ++Date: March 2009 ++Contact: J. R. Okajima ++Description: ++ It shows the consumed blocks by xino (External Inode Number ++ Translation Table), its link count, block size and file ++ size. ++ When the aufs mount option 'noxino' is specified, it ++ will be empty. About XINO files, see the aufs manual. ++ ++What: /debug/aufs/si_/xigen ++Date: March 2009 ++Contact: J. R. Okajima ++Description: ++ It shows the consumed blocks by xigen (External Inode ++ Generation Table), its block size and file size. ++ If CONFIG_AUFS_EXPORT is disabled, this entry will not ++ be created. ++ When the aufs mount option 'noxino' is specified, it ++ will be empty. About XINO files, see the aufs manual. +diff --git a/Documentation/ABI/testing/sysfs-aufs b/Documentation/ABI/testing/sysfs-aufs +new file mode 100644 +index 0000000..82f9518 +--- /dev/null ++++ b/Documentation/ABI/testing/sysfs-aufs +@@ -0,0 +1,31 @@ ++What: /sys/fs/aufs/si_/ ++Date: March 2009 ++Contact: J. R. Okajima ++Description: ++ Under /sys/fs/aufs, a directory named si_ is created ++ per aufs mount, where is a unique id generated ++ internally. ++ ++What: /sys/fs/aufs/si_/br0, br1 ... brN ++Date: March 2009 ++Contact: J. R. Okajima ++Description: ++ It shows the abolute path of a member directory (which ++ is called branch) in aufs, and its permission. ++ ++What: /sys/fs/aufs/si_/brid0, brid1 ... bridN ++Date: July 2013 ++Contact: J. R. Okajima ++Description: ++ It shows the id of a member directory (which is called ++ branch) in aufs. ++ ++What: /sys/fs/aufs/si_/xi_path ++Date: March 2009 ++Contact: J. R. Okajima ++Description: ++ It shows the abolute path of XINO (External Inode Number ++ Bitmap, Translation Table and Generation Table) file ++ even if it is the default path. ++ When the aufs mount option 'noxino' is specified, it ++ will be empty. About XINO files, see the aufs manual. +diff --git a/Documentation/filesystems/aufs/README b/Documentation/filesystems/aufs/README +new file mode 100644 +index 0000000..36df674 +--- /dev/null ++++ b/Documentation/filesystems/aufs/README +@@ -0,0 +1,392 @@ ++ ++Aufs4 -- advanced multi layered unification filesystem version 4.x ++http://aufs.sf.net ++Junjiro R. Okajima ++ ++ ++0. Introduction ++---------------------------------------- ++In the early days, aufs was entirely re-designed and re-implemented ++Unionfs Version 1.x series. Adding many original ideas, approaches, ++improvements and implementations, it becomes totally different from ++Unionfs while keeping the basic features. ++Recently, Unionfs Version 2.x series begin taking some of the same ++approaches to aufs1's. ++Unionfs is being developed by Professor Erez Zadok at Stony Brook ++University and his team. ++ ++Aufs4 supports linux-4.0 and later, and for linux-3.x series try aufs3. ++If you want older kernel version support, try aufs2-2.6.git or ++aufs2-standalone.git repository, aufs1 from CVS on SourceForge. ++ ++Note: it becomes clear that "Aufs was rejected. Let's give it up." ++ According to Christoph Hellwig, linux rejects all union-type ++ filesystems but UnionMount. ++ ++ ++PS. Al Viro seems have a plan to merge aufs as well as overlayfs and ++ UnionMount, and he pointed out an issue around a directory mutex ++ lock and aufs addressed it. But it is still unsure whether aufs will ++ be merged (or any other union solution). ++ ++ ++ ++1. Features ++---------------------------------------- ++- unite several directories into a single virtual filesystem. The member ++ directory is called as a branch. ++- you can specify the permission flags to the branch, which are 'readonly', ++ 'readwrite' and 'whiteout-able.' ++- by upper writable branch, internal copyup and whiteout, files/dirs on ++ readonly branch are modifiable logically. ++- dynamic branch manipulation, add, del. ++- etc... ++ ++Also there are many enhancements in aufs, such as: ++- test only the highest one for the directory permission (dirperm1) ++- copyup on open (coo=) ++- 'move' policy for copy-up between two writable branches, after ++ checking free space. ++- xattr, acl ++- readdir(3) in userspace. ++- keep inode number by external inode number table ++- keep the timestamps of file/dir in internal copyup operation ++- seekable directory, supporting NFS readdir. ++- whiteout is hardlinked in order to reduce the consumption of inodes ++ on branch ++- do not copyup, nor create a whiteout when it is unnecessary ++- revert a single systemcall when an error occurs in aufs ++- remount interface instead of ioctl ++- maintain /etc/mtab by an external command, /sbin/mount.aufs. ++- loopback mounted filesystem as a branch ++- kernel thread for removing the dir who has a plenty of whiteouts ++- support copyup sparse file (a file which has a 'hole' in it) ++- default permission flags for branches ++- selectable permission flags for ro branch, whether whiteout can ++ exist or not ++- export via NFS. ++- support /fs/aufs and /aufs. ++- support multiple writable branches, some policies to select one ++ among multiple writable branches. ++- a new semantics for link(2) and rename(2) to support multiple ++ writable branches. ++- no glibc changes are required. ++- pseudo hardlink (hardlink over branches) ++- allow a direct access manually to a file on branch, e.g. bypassing aufs. ++ including NFS or remote filesystem branch. ++- userspace wrapper for pathconf(3)/fpathconf(3) with _PC_LINK_MAX. ++- and more... ++ ++Currently these features are dropped temporary from aufs4. ++See design/08plan.txt in detail. ++- nested mount, i.e. aufs as readonly no-whiteout branch of another aufs ++ (robr) ++- statistics of aufs thread (/sys/fs/aufs/stat) ++ ++Features or just an idea in the future (see also design/*.txt), ++- reorder the branch index without del/re-add. ++- permanent xino files for NFSD ++- an option for refreshing the opened files after add/del branches ++- light version, without branch manipulation. (unnecessary?) ++- copyup in userspace ++- inotify in userspace ++- readv/writev ++ ++ ++2. Download ++---------------------------------------- ++There are three GIT trees for aufs4, aufs4-linux.git, ++aufs4-standalone.git, and aufs-util.git. Note that there is no "4" in ++"aufs-util.git." ++While the aufs-util is always necessary, you need either of aufs4-linux ++or aufs4-standalone. ++ ++The aufs4-linux tree includes the whole linux mainline GIT tree, ++git://git.kernel.org/.../torvalds/linux.git. ++And you cannot select CONFIG_AUFS_FS=m for this version, eg. you cannot ++build aufs4 as an external kernel module. ++Several extra patches are not included in this tree. Only ++aufs4-standalone tree contains them. They are described in the later ++section "Configuration and Compilation." ++ ++On the other hand, the aufs4-standalone tree has only aufs source files ++and necessary patches, and you can select CONFIG_AUFS_FS=m. ++But you need to apply all aufs patches manually. ++ ++You will find GIT branches whose name is in form of "aufs4.x" where "x" ++represents the linux kernel version, "linux-4.x". For instance, ++"aufs4.0" is for linux-4.0. For latest "linux-4.x-rcN", use ++"aufs4.x-rcN" branch. ++ ++o aufs4-linux tree ++$ git clone --reference /your/linux/git/tree \ ++ git://github.com/sfjro/aufs4-linux.git aufs4-linux.git ++- if you don't have linux GIT tree, then remove "--reference ..." ++$ cd aufs4-linux.git ++$ git checkout origin/aufs4.0 ++ ++Or You may want to directly git-pull aufs into your linux GIT tree, and ++leave the patch-work to GIT. ++$ cd /your/linux/git/tree ++$ git remote add aufs4 git://github.com/sfjro/aufs4-linux.git ++$ git fetch aufs4 ++$ git checkout -b my4.0 v4.0 ++$ (add your local change...) ++$ git pull aufs4 aufs4.0 ++- now you have v4.0 + your_changes + aufs4.0 in you my4.0 branch. ++- you may need to solve some conflicts between your_changes and ++ aufs4.0. in this case, git-rerere is recommended so that you can ++ solve the similar conflicts automatically when you upgrade to 4.1 or ++ later in the future. ++ ++o aufs4-standalone tree ++$ git clone git://github.com/sfjro/aufs4-standalone.git aufs4-standalone.git ++$ cd aufs4-standalone.git ++$ git checkout origin/aufs4.0 ++ ++o aufs-util tree ++$ git clone git://git.code.sf.net/p/aufs/aufs-util aufs-util.git ++- note that the public aufs-util.git is on SourceForge instead of ++ GitHUB. ++$ cd aufs-util.git ++$ git checkout origin/aufs4.0 ++ ++Note: The 4.x-rcN branch is to be used with `rc' kernel versions ONLY. ++The minor version number, 'x' in '4.x', of aufs may not always ++follow the minor version number of the kernel. ++Because changes in the kernel that cause the use of a new ++minor version number do not always require changes to aufs-util. ++ ++Since aufs-util has its own minor version number, you may not be ++able to find a GIT branch in aufs-util for your kernel's ++exact minor version number. ++In this case, you should git-checkout the branch for the ++nearest lower number. ++ ++For (an unreleased) example: ++If you are using "linux-4.10" and the "aufs4.10" branch ++does not exist in aufs-util repository, then "aufs4.9", "aufs4.8" ++or something numerically smaller is the branch for your kernel. ++ ++Also you can view all branches by ++ $ git branch -a ++ ++ ++3. Configuration and Compilation ++---------------------------------------- ++Make sure you have git-checkout'ed the correct branch. ++ ++For aufs4-linux tree, ++- enable CONFIG_AUFS_FS. ++- set other aufs configurations if necessary. ++ ++For aufs4-standalone tree, ++There are several ways to build. ++ ++1. ++- apply ./aufs4-kbuild.patch to your kernel source files. ++- apply ./aufs4-base.patch too. ++- apply ./aufs4-mmap.patch too. ++- apply ./aufs4-standalone.patch too, if you have a plan to set ++ CONFIG_AUFS_FS=m. otherwise you don't need ./aufs4-standalone.patch. ++- copy ./{Documentation,fs,include/uapi/linux/aufs_type.h} files to your ++ kernel source tree. Never copy $PWD/include/uapi/linux/Kbuild. ++- enable CONFIG_AUFS_FS, you can select either ++ =m or =y. ++- and build your kernel as usual. ++- install the built kernel. ++ Note: Since linux-3.9, every filesystem module requires an alias ++ "fs-". You should make sure that "fs-aufs" is listed in your ++ modules.aliases file if you set CONFIG_AUFS_FS=m. ++- install the header files too by "make headers_install" to the ++ directory where you specify. By default, it is $PWD/usr. ++ "make help" shows a brief note for headers_install. ++- and reboot your system. ++ ++2. ++- module only (CONFIG_AUFS_FS=m). ++- apply ./aufs4-base.patch to your kernel source files. ++- apply ./aufs4-mmap.patch too. ++- apply ./aufs4-standalone.patch too. ++- build your kernel, don't forget "make headers_install", and reboot. ++- edit ./config.mk and set other aufs configurations if necessary. ++ Note: You should read $PWD/fs/aufs/Kconfig carefully which describes ++ every aufs configurations. ++- build the module by simple "make". ++ Note: Since linux-3.9, every filesystem module requires an alias ++ "fs-". You should make sure that "fs-aufs" is listed in your ++ modules.aliases file. ++- you can specify ${KDIR} make variable which points to your kernel ++ source tree. ++- install the files ++ + run "make install" to install the aufs module, or copy the built ++ $PWD/aufs.ko to /lib/modules/... and run depmod -a (or reboot simply). ++ + run "make install_headers" (instead of headers_install) to install ++ the modified aufs header file (you can specify DESTDIR which is ++ available in aufs standalone version's Makefile only), or copy ++ $PWD/usr/include/linux/aufs_type.h to /usr/include/linux or wherever ++ you like manually. By default, the target directory is $PWD/usr. ++- no need to apply aufs4-kbuild.patch, nor copying source files to your ++ kernel source tree. ++ ++Note: The header file aufs_type.h is necessary to build aufs-util ++ as well as "make headers_install" in the kernel source tree. ++ headers_install is subject to be forgotten, but it is essentially ++ necessary, not only for building aufs-util. ++ You may not meet problems without headers_install in some older ++ version though. ++ ++And then, ++- read README in aufs-util, build and install it ++- note that your distribution may contain an obsoleted version of ++ aufs_type.h in /usr/include/linux or something. When you build aufs ++ utilities, make sure that your compiler refers the correct aufs header ++ file which is built by "make headers_install." ++- if you want to use readdir(3) in userspace or pathconf(3) wrapper, ++ then run "make install_ulib" too. And refer to the aufs manual in ++ detail. ++ ++There several other patches in aufs4-standalone.git. They are all ++optional. When you meet some problems, they will help you. ++- aufs4-loopback.patch ++ Supports a nested loopback mount in a branch-fs. This patch is ++ unnecessary until aufs produces a message like "you may want to try ++ another patch for loopback file". ++- vfs-ino.patch ++ Modifies a system global kernel internal function get_next_ino() in ++ order to stop assigning 0 for an inode-number. Not directly related to ++ aufs, but recommended generally. ++- tmpfs-idr.patch ++ Keeps the tmpfs inode number as the lowest value. Effective to reduce ++ the size of aufs XINO files for tmpfs branch. Also it prevents the ++ duplication of inode number, which is important for backup tools and ++ other utilities. When you find aufs XINO files for tmpfs branch ++ growing too much, try this patch. ++- lockdep-debug.patch ++ Because aufs is not only an ordinary filesystem (callee of VFS), but ++ also a caller of VFS functions for branch filesystems, subclassing of ++ the internal locks for LOCKDEP is necessary. LOCKDEP is a debugging ++ feature of linux kernel. If you enable CONFIG_LOCKDEP, then you will ++ need to apply this debug patch to expand several constant values. ++ If don't know what LOCKDEP, then you don't have apply this patch. ++ ++ ++4. Usage ++---------------------------------------- ++At first, make sure aufs-util are installed, and please read the aufs ++manual, aufs.5 in aufs-util.git tree. ++$ man -l aufs.5 ++ ++And then, ++$ mkdir /tmp/rw /tmp/aufs ++# mount -t aufs -o br=/tmp/rw:${HOME} none /tmp/aufs ++ ++Here is another example. The result is equivalent. ++# mount -t aufs -o br=/tmp/rw=rw:${HOME}=ro none /tmp/aufs ++ Or ++# mount -t aufs -o br:/tmp/rw none /tmp/aufs ++# mount -o remount,append:${HOME} /tmp/aufs ++ ++Then, you can see whole tree of your home dir through /tmp/aufs. If ++you modify a file under /tmp/aufs, the one on your home directory is ++not affected, instead the same named file will be newly created under ++/tmp/rw. And all of your modification to a file will be applied to ++the one under /tmp/rw. This is called the file based Copy on Write ++(COW) method. ++Aufs mount options are described in aufs.5. ++If you run chroot or something and make your aufs as a root directory, ++then you need to customize the shutdown script. See the aufs manual in ++detail. ++ ++Additionally, there are some sample usages of aufs which are a ++diskless system with network booting, and LiveCD over NFS. ++See sample dir in CVS tree on SourceForge. ++ ++ ++5. Contact ++---------------------------------------- ++When you have any problems or strange behaviour in aufs, please let me ++know with: ++- /proc/mounts (instead of the output of mount(8)) ++- /sys/module/aufs/* ++- /sys/fs/aufs/* (if you have them) ++- /debug/aufs/* (if you have them) ++- linux kernel version ++ if your kernel is not plain, for example modified by distributor, ++ the url where i can download its source is necessary too. ++- aufs version which was printed at loading the module or booting the ++ system, instead of the date you downloaded. ++- configuration (define/undefine CONFIG_AUFS_xxx) ++- kernel configuration or /proc/config.gz (if you have it) ++- behaviour which you think to be incorrect ++- actual operation, reproducible one is better ++- mailto: aufs-users at lists.sourceforge.net ++ ++Usually, I don't watch the Public Areas(Bugs, Support Requests, Patches, ++and Feature Requests) on SourceForge. Please join and write to ++aufs-users ML. ++ ++ ++6. Acknowledgements ++---------------------------------------- ++Thanks to everyone who have tried and are using aufs, whoever ++have reported a bug or any feedback. ++ ++Especially donators: ++Tomas Matejicek(slax.org) made a donation (much more than once). ++ Since Apr 2010, Tomas M (the author of Slax and Linux Live ++ scripts) is making "doubling" donations. ++ Unfortunately I cannot list all of the donators, but I really ++ appreciate. ++ It ends Aug 2010, but the ordinary donation URL is still available. ++ ++Dai Itasaka made a donation (2007/8). ++Chuck Smith made a donation (2008/4, 10 and 12). ++Henk Schoneveld made a donation (2008/9). ++Chih-Wei Huang, ASUS, CTC donated Eee PC 4G (2008/10). ++Francois Dupoux made a donation (2008/11). ++Bruno Cesar Ribas and Luis Carlos Erpen de Bona, C3SL serves public ++ aufs2 GIT tree (2009/2). ++William Grant made a donation (2009/3). ++Patrick Lane made a donation (2009/4). ++The Mail Archive (mail-archive.com) made donations (2009/5). ++Nippy Networks (Ed Wildgoose) made a donation (2009/7). ++New Dream Network, LLC (www.dreamhost.com) made a donation (2009/11). ++Pavel Pronskiy made a donation (2011/2). ++Iridium and Inmarsat satellite phone retailer (www.mailasail.com), Nippy ++ Networks (Ed Wildgoose) made a donation for hardware (2011/3). ++Max Lekomcev (DOM-TV project) made a donation (2011/7, 12, 2012/3, 6 and ++11). ++Sam Liddicott made a donation (2011/9). ++Era Scarecrow made a donation (2013/4). ++Bor Ratajc made a donation (2013/4). ++Alessandro Gorreta made a donation (2013/4). ++POIRETTE Marc made a donation (2013/4). ++Alessandro Gorreta made a donation (2013/4). ++lauri kasvandik made a donation (2013/5). ++"pemasu from Finland" made a donation (2013/7). ++The Parted Magic Project made a donation (2013/9 and 11). ++Pavel Barta made a donation (2013/10). ++Nikolay Pertsev made a donation (2014/5). ++James B made a donation (2014/7 and 2015/7). ++Stefano Di Biase made a donation (2014/8). ++Daniel Epellei made a donation (2015/1). ++OmegaPhil made a donation (2016/1). ++Tomasz Szewczyk made a donation (2016/4). ++ ++Thank you very much. ++Donations are always, including future donations, very important and ++helpful for me to keep on developing aufs. ++ ++ ++7. ++---------------------------------------- ++If you are an experienced user, no explanation is needed. Aufs is ++just a linux filesystem. ++ ++ ++Enjoy! ++ ++# Local variables: ; ++# mode: text; ++# End: ; +diff --git a/Documentation/filesystems/aufs/design/01intro.txt b/Documentation/filesystems/aufs/design/01intro.txt +new file mode 100644 +index 0000000..2799da9 +--- /dev/null ++++ b/Documentation/filesystems/aufs/design/01intro.txt +@@ -0,0 +1,170 @@ ++ ++# Copyright (C) 2005-2016 Junjiro R. Okajima ++# ++# This program is free software; you can redistribute it and/or modify ++# it under the terms of the GNU General Public License as published by ++# the Free Software Foundation; either version 2 of the License, or ++# (at your option) any later version. ++# ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# GNU General Public License for more details. ++# ++# You should have received a copy of the GNU General Public License ++# along with this program. If not, see . ++ ++Introduction ++---------------------------------------- ++ ++aufs [ei ju: ef es] | [a u f s] ++1. abbrev. for "advanced multi-layered unification filesystem". ++2. abbrev. for "another unionfs". ++3. abbrev. for "auf das" in German which means "on the" in English. ++ Ex. "Butter aufs Brot"(G) means "butter onto bread"(E). ++ But "Filesystem aufs Filesystem" is hard to understand. ++ ++AUFS is a filesystem with features: ++- multi layered stackable unification filesystem, the member directory ++ is called as a branch. ++- branch permission and attribute, 'readonly', 'real-readonly', ++ 'readwrite', 'whiteout-able', 'link-able whiteout', etc. and their ++ combination. ++- internal "file copy-on-write". ++- logical deletion, whiteout. ++- dynamic branch manipulation, adding, deleting and changing permission. ++- allow bypassing aufs, user's direct branch access. ++- external inode number translation table and bitmap which maintains the ++ persistent aufs inode number. ++- seekable directory, including NFS readdir. ++- file mapping, mmap and sharing pages. ++- pseudo-link, hardlink over branches. ++- loopback mounted filesystem as a branch. ++- several policies to select one among multiple writable branches. ++- revert a single systemcall when an error occurs in aufs. ++- and more... ++ ++ ++Multi Layered Stackable Unification Filesystem ++---------------------------------------------------------------------- ++Most people already knows what it is. ++It is a filesystem which unifies several directories and provides a ++merged single directory. When users access a file, the access will be ++passed/re-directed/converted (sorry, I am not sure which English word is ++correct) to the real file on the member filesystem. The member ++filesystem is called 'lower filesystem' or 'branch' and has a mode ++'readonly' and 'readwrite.' And the deletion for a file on the lower ++readonly branch is handled by creating 'whiteout' on the upper writable ++branch. ++ ++On LKML, there have been discussions about UnionMount (Jan Blunck, ++Bharata B Rao and Valerie Aurora) and Unionfs (Erez Zadok). They took ++different approaches to implement the merged-view. ++The former tries putting it into VFS, and the latter implements as a ++separate filesystem. ++(If I misunderstand about these implementations, please let me know and ++I shall correct it. Because it is a long time ago when I read their ++source files last time). ++ ++UnionMount's approach will be able to small, but may be hard to share ++branches between several UnionMount since the whiteout in it is ++implemented in the inode on branch filesystem and always ++shared. According to Bharata's post, readdir does not seems to be ++finished yet. ++There are several missing features known in this implementations such as ++- for users, the inode number may change silently. eg. copy-up. ++- link(2) may break by copy-up. ++- read(2) may get an obsoleted filedata (fstat(2) too). ++- fcntl(F_SETLK) may be broken by copy-up. ++- unnecessary copy-up may happen, for example mmap(MAP_PRIVATE) after ++ open(O_RDWR). ++ ++In linux-3.18, "overlay" filesystem (formerly known as "overlayfs") was ++merged into mainline. This is another implementation of UnionMount as a ++separated filesystem. All the limitations and known problems which ++UnionMount are equally inherited to "overlay" filesystem. ++ ++Unionfs has a longer history. When I started implementing a stackable ++filesystem (Aug 2005), it already existed. It has virtual super_block, ++inode, dentry and file objects and they have an array pointing lower ++same kind objects. After contributing many patches for Unionfs, I ++re-started my project AUFS (Jun 2006). ++ ++In AUFS, the structure of filesystem resembles to Unionfs, but I ++implemented my own ideas, approaches and enhancements and it became ++totally different one. ++ ++Comparing DM snapshot and fs based implementation ++- the number of bytes to be copied between devices is much smaller. ++- the type of filesystem must be one and only. ++- the fs must be writable, no readonly fs, even for the lower original ++ device. so the compression fs will not be usable. but if we use ++ loopback mount, we may address this issue. ++ for instance, ++ mount /cdrom/squashfs.img /sq ++ losetup /sq/ext2.img ++ losetup /somewhere/cow ++ dmsetup "snapshot /dev/loop0 /dev/loop1 ..." ++- it will be difficult (or needs more operations) to extract the ++ difference between the original device and COW. ++- DM snapshot-merge may help a lot when users try merging. in the ++ fs-layer union, users will use rsync(1). ++ ++You may want to read my old paper "Filesystems in LiveCD" ++(http://aufs.sourceforge.net/aufs2/report/sq/sq.pdf). ++ ++ ++Several characters/aspects/persona of aufs ++---------------------------------------------------------------------- ++ ++Aufs has several characters, aspects or persona. ++1. a filesystem, callee of VFS helper ++2. sub-VFS, caller of VFS helper for branches ++3. a virtual filesystem which maintains persistent inode number ++4. reader/writer of files on branches such like an application ++ ++1. Callee of VFS Helper ++As an ordinary linux filesystem, aufs is a callee of VFS. For instance, ++unlink(2) from an application reaches sys_unlink() kernel function and ++then vfs_unlink() is called. vfs_unlink() is one of VFS helper and it ++calls filesystem specific unlink operation. Actually aufs implements the ++unlink operation but it behaves like a redirector. ++ ++2. Caller of VFS Helper for Branches ++aufs_unlink() passes the unlink request to the branch filesystem as if ++it were called from VFS. So the called unlink operation of the branch ++filesystem acts as usual. As a caller of VFS helper, aufs should handle ++every necessary pre/post operation for the branch filesystem. ++- acquire the lock for the parent dir on a branch ++- lookup in a branch ++- revalidate dentry on a branch ++- mnt_want_write() for a branch ++- vfs_unlink() for a branch ++- mnt_drop_write() for a branch ++- release the lock on a branch ++ ++3. Persistent Inode Number ++One of the most important issue for a filesystem is to maintain inode ++numbers. This is particularly important to support exporting a ++filesystem via NFS. Aufs is a virtual filesystem which doesn't have a ++backend block device for its own. But some storage is necessary to ++keep and maintain the inode numbers. It may be a large space and may not ++suit to keep in memory. Aufs rents some space from its first writable ++branch filesystem (by default) and creates file(s) on it. These files ++are created by aufs internally and removed soon (currently) keeping ++opened. ++Note: Because these files are removed, they are totally gone after ++ unmounting aufs. It means the inode numbers are not persistent ++ across unmount or reboot. I have a plan to make them really ++ persistent which will be important for aufs on NFS server. ++ ++4. Read/Write Files Internally (copy-on-write) ++Because a branch can be readonly, when you write a file on it, aufs will ++"copy-up" it to the upper writable branch internally. And then write the ++originally requested thing to the file. Generally kernel doesn't ++open/read/write file actively. In aufs, even a single write may cause a ++internal "file copy". This behaviour is very similar to cp(1) command. ++ ++Some people may think it is better to pass such work to user space ++helper, instead of doing in kernel space. Actually I am still thinking ++about it. But currently I have implemented it in kernel space. +diff --git a/Documentation/filesystems/aufs/design/02struct.txt b/Documentation/filesystems/aufs/design/02struct.txt +new file mode 100644 +index 0000000..6ad4a5b +--- /dev/null ++++ b/Documentation/filesystems/aufs/design/02struct.txt +@@ -0,0 +1,258 @@ ++ ++# Copyright (C) 2005-2016 Junjiro R. Okajima ++# ++# This program is free software; you can redistribute it and/or modify ++# it under the terms of the GNU General Public License as published by ++# the Free Software Foundation; either version 2 of the License, or ++# (at your option) any later version. ++# ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# GNU General Public License for more details. ++# ++# You should have received a copy of the GNU General Public License ++# along with this program. If not, see . ++ ++Basic Aufs Internal Structure ++ ++Superblock/Inode/Dentry/File Objects ++---------------------------------------------------------------------- ++As like an ordinary filesystem, aufs has its own ++superblock/inode/dentry/file objects. All these objects have a ++dynamically allocated array and store the same kind of pointers to the ++lower filesystem, branch. ++For example, when you build a union with one readwrite branch and one ++readonly, mounted /au, /rw and /ro respectively. ++- /au = /rw + /ro ++- /ro/fileA exists but /rw/fileA ++ ++Aufs lookup operation finds /ro/fileA and gets dentry for that. These ++pointers are stored in a aufs dentry. The array in aufs dentry will be, ++- [0] = NULL (because /rw/fileA doesn't exist) ++- [1] = /ro/fileA ++ ++This style of an array is essentially same to the aufs ++superblock/inode/dentry/file objects. ++ ++Because aufs supports manipulating branches, ie. add/delete/change ++branches dynamically, these objects has its own generation. When ++branches are changed, the generation in aufs superblock is ++incremented. And a generation in other object are compared when it is ++accessed. When a generation in other objects are obsoleted, aufs ++refreshes the internal array. ++ ++ ++Superblock ++---------------------------------------------------------------------- ++Additionally aufs superblock has some data for policies to select one ++among multiple writable branches, XIB files, pseudo-links and kobject. ++See below in detail. ++About the policies which supports copy-down a directory, see ++wbr_policy.txt too. ++ ++ ++Branch and XINO(External Inode Number Translation Table) ++---------------------------------------------------------------------- ++Every branch has its own xino (external inode number translation table) ++file. The xino file is created and unlinked by aufs internally. When two ++members of a union exist on the same filesystem, they share the single ++xino file. ++The struct of a xino file is simple, just a sequence of aufs inode ++numbers which is indexed by the lower inode number. ++In the above sample, assume the inode number of /ro/fileA is i111 and ++aufs assigns the inode number i999 for fileA. Then aufs writes 999 as ++4(8) bytes at 111 * 4(8) bytes offset in the xino file. ++ ++When the inode numbers are not contiguous, the xino file will be sparse ++which has a hole in it and doesn't consume as much disk space as it ++might appear. If your branch filesystem consumes disk space for such ++holes, then you should specify 'xino=' option at mounting aufs. ++ ++Aufs has a mount option to free the disk blocks for such holes in XINO ++files on tmpfs or ramdisk. But it is not so effective actually. If you ++meet a problem of disk shortage due to XINO files, then you should try ++"tmpfs-ino.patch" (and "vfs-ino.patch" too) in aufs4-standalone.git. ++The patch localizes the assignment inumbers per tmpfs-mount and avoid ++the holes in XINO files. ++ ++Also a writable branch has three kinds of "whiteout bases". All these ++are existed when the branch is joined to aufs, and their names are ++whiteout-ed doubly, so that users will never see their names in aufs ++hierarchy. ++1. a regular file which will be hardlinked to all whiteouts. ++2. a directory to store a pseudo-link. ++3. a directory to store an "orphan"-ed file temporary. ++ ++1. Whiteout Base ++ When you remove a file on a readonly branch, aufs handles it as a ++ logical deletion and creates a whiteout on the upper writable branch ++ as a hardlink of this file in order not to consume inode on the ++ writable branch. ++2. Pseudo-link Dir ++ See below, Pseudo-link. ++3. Step-Parent Dir ++ When "fileC" exists on the lower readonly branch only and it is ++ opened and removed with its parent dir, and then user writes ++ something into it, then aufs copies-up fileC to this ++ directory. Because there is no other dir to store fileC. After ++ creating a file under this dir, the file is unlinked. ++ ++Because aufs supports manipulating branches, ie. add/delete/change ++dynamically, a branch has its own id. When the branch order changes, ++aufs finds the new index by searching the branch id. ++ ++ ++Pseudo-link ++---------------------------------------------------------------------- ++Assume "fileA" exists on the lower readonly branch only and it is ++hardlinked to "fileB" on the branch. When you write something to fileA, ++aufs copies-up it to the upper writable branch. Additionally aufs ++creates a hardlink under the Pseudo-link Directory of the writable ++branch. The inode of a pseudo-link is kept in aufs super_block as a ++simple list. If fileB is read after unlinking fileA, aufs returns ++filedata from the pseudo-link instead of the lower readonly ++branch. Because the pseudo-link is based upon the inode, to keep the ++inode number by xino (see above) is essentially necessary. ++ ++All the hardlinks under the Pseudo-link Directory of the writable branch ++should be restored in a proper location later. Aufs provides a utility ++to do this. The userspace helpers executed at remounting and unmounting ++aufs by default. ++During this utility is running, it puts aufs into the pseudo-link ++maintenance mode. In this mode, only the process which began the ++maintenance mode (and its child processes) is allowed to operate in ++aufs. Some other processes which are not related to the pseudo-link will ++be allowed to run too, but the rest have to return an error or wait ++until the maintenance mode ends. If a process already acquires an inode ++mutex (in VFS), it has to return an error. ++ ++ ++XIB(external inode number bitmap) ++---------------------------------------------------------------------- ++Addition to the xino file per a branch, aufs has an external inode number ++bitmap in a superblock object. It is also an internal file such like a ++xino file. ++It is a simple bitmap to mark whether the aufs inode number is in-use or ++not. ++To reduce the file I/O, aufs prepares a single memory page to cache xib. ++ ++As well as XINO files, aufs has a feature to truncate/refresh XIB to ++reduce the number of consumed disk blocks for these files. ++ ++ ++Virtual or Vertical Dir, and Readdir in Userspace ++---------------------------------------------------------------------- ++In order to support multiple layers (branches), aufs readdir operation ++constructs a virtual dir block on memory. For readdir, aufs calls ++vfs_readdir() internally for each dir on branches, merges their entries ++with eliminating the whiteout-ed ones, and sets it to file (dir) ++object. So the file object has its entry list until it is closed. The ++entry list will be updated when the file position is zero and becomes ++obsoleted. This decision is made in aufs automatically. ++ ++The dynamically allocated memory block for the name of entries has a ++unit of 512 bytes (by default) and stores the names contiguously (no ++padding). Another block for each entry is handled by kmem_cache too. ++During building dir blocks, aufs creates hash list and judging whether ++the entry is whiteouted by its upper branch or already listed. ++The merged result is cached in the corresponding inode object and ++maintained by a customizable life-time option. ++ ++Some people may call it can be a security hole or invite DoS attack ++since the opened and once readdir-ed dir (file object) holds its entry ++list and becomes a pressure for system memory. But I'd say it is similar ++to files under /proc or /sys. The virtual files in them also holds a ++memory page (generally) while they are opened. When an idea to reduce ++memory for them is introduced, it will be applied to aufs too. ++For those who really hate this situation, I've developed readdir(3) ++library which operates this merging in userspace. You just need to set ++LD_PRELOAD environment variable, and aufs will not consume no memory in ++kernel space for readdir(3). ++ ++ ++Workqueue ++---------------------------------------------------------------------- ++Aufs sometimes requires privilege access to a branch. For instance, ++in copy-up/down operation. When a user process is going to make changes ++to a file which exists in the lower readonly branch only, and the mode ++of one of ancestor directories may not be writable by a user ++process. Here aufs copy-up the file with its ancestors and they may ++require privilege to set its owner/group/mode/etc. ++This is a typical case of a application character of aufs (see ++Introduction). ++ ++Aufs uses workqueue synchronously for this case. It creates its own ++workqueue. The workqueue is a kernel thread and has privilege. Aufs ++passes the request to call mkdir or write (for example), and wait for ++its completion. This approach solves a problem of a signal handler ++simply. ++If aufs didn't adopt the workqueue and changed the privilege of the ++process, then the process may receive the unexpected SIGXFSZ or other ++signals. ++ ++Also aufs uses the system global workqueue ("events" kernel thread) too ++for asynchronous tasks, such like handling inotify/fsnotify, re-creating a ++whiteout base and etc. This is unrelated to a privilege. ++Most of aufs operation tries acquiring a rw_semaphore for aufs ++superblock at the beginning, at the same time waits for the completion ++of all queued asynchronous tasks. ++ ++ ++Whiteout ++---------------------------------------------------------------------- ++The whiteout in aufs is very similar to Unionfs's. That is represented ++by its filename. UnionMount takes an approach of a file mode, but I am ++afraid several utilities (find(1) or something) will have to support it. ++ ++Basically the whiteout represents "logical deletion" which stops aufs to ++lookup further, but also it represents "dir is opaque" which also stop ++further lookup. ++ ++In aufs, rmdir(2) and rename(2) for dir uses whiteout alternatively. ++In order to make several functions in a single systemcall to be ++revertible, aufs adopts an approach to rename a directory to a temporary ++unique whiteouted name. ++For example, in rename(2) dir where the target dir already existed, aufs ++renames the target dir to a temporary unique whiteouted name before the ++actual rename on a branch, and then handles other actions (make it opaque, ++update the attributes, etc). If an error happens in these actions, aufs ++simply renames the whiteouted name back and returns an error. If all are ++succeeded, aufs registers a function to remove the whiteouted unique ++temporary name completely and asynchronously to the system global ++workqueue. ++ ++ ++Copy-up ++---------------------------------------------------------------------- ++It is a well-known feature or concept. ++When user modifies a file on a readonly branch, aufs operate "copy-up" ++internally and makes change to the new file on the upper writable branch. ++When the trigger systemcall does not update the timestamps of the parent ++dir, aufs reverts it after copy-up. ++ ++ ++Move-down (aufs3.9 and later) ++---------------------------------------------------------------------- ++"Copy-up" is one of the essential feature in aufs. It copies a file from ++the lower readonly branch to the upper writable branch when a user ++changes something about the file. ++"Move-down" is an opposite action of copy-up. Basically this action is ++ran manually instead of automatically and internally. ++For desgin and implementation, aufs has to consider these issues. ++- whiteout for the file may exist on the lower branch. ++- ancestor directories may not exist on the lower branch. ++- diropq for the ancestor directories may exist on the upper branch. ++- free space on the lower branch will reduce. ++- another access to the file may happen during moving-down, including ++ UDBA (see "Revalidate Dentry and UDBA"). ++- the file should not be hard-linked nor pseudo-linked. they should be ++ handled by auplink utility later. ++ ++Sometimes users want to move-down a file from the upper writable branch ++to the lower readonly or writable branch. For instance, ++- the free space of the upper writable branch is going to run out. ++- create a new intermediate branch between the upper and lower branch. ++- etc. ++ ++For this purpose, use "aumvdown" command in aufs-util.git. +diff --git a/Documentation/filesystems/aufs/design/03atomic_open.txt b/Documentation/filesystems/aufs/design/03atomic_open.txt +new file mode 100644 +index 0000000..9cfd1da +--- /dev/null ++++ b/Documentation/filesystems/aufs/design/03atomic_open.txt +@@ -0,0 +1,85 @@ ++ ++# Copyright (C) 2015-2016 Junjiro R. Okajima ++# ++# This program is free software; you can redistribute it and/or modify ++# it under the terms of the GNU General Public License as published by ++# the Free Software Foundation; either version 2 of the License, or ++# (at your option) any later version. ++# ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# GNU General Public License for more details. ++# ++# You should have received a copy of the GNU General Public License ++# along with this program. If not, see . ++ ++Support for a branch who has its ->atomic_open() ++---------------------------------------------------------------------- ++The filesystems who implement its ->atomic_open() are not majority. For ++example NFSv4 does, and aufs should call NFSv4 ->atomic_open, ++particularly for open(O_CREAT|O_EXCL, 0400) case. Other than ++->atomic_open(), NFSv4 returns an error for this open(2). While I am not ++sure whether all filesystems who have ->atomic_open() behave like this, ++but NFSv4 surely returns the error. ++ ++In order to support ->atomic_open() for aufs, there are a few ++approaches. ++ ++A. Introduce aufs_atomic_open() ++ - calls one of VFS:do_last(), lookup_open() or atomic_open() for ++ branch fs. ++B. Introduce aufs_atomic_open() calling create, open and chmod. this is ++ an aufs user Pip Cet's approach ++ - calls aufs_create(), VFS finish_open() and notify_change(). ++ - pass fake-mode to finish_open(), and then correct the mode by ++ notify_change(). ++C. Extend aufs_open() to call branch fs's ->atomic_open() ++ - no aufs_atomic_open(). ++ - aufs_lookup() registers the TID to an aufs internal object. ++ - aufs_create() does nothing when the matching TID is registered, but ++ registers the mode. ++ - aufs_open() calls branch fs's ->atomic_open() when the matching ++ TID is registered. ++D. Extend aufs_open() to re-try branch fs's ->open() with superuser's ++ credential ++ - no aufs_atomic_open(). ++ - aufs_create() registers the TID to an internal object. this info ++ represents "this process created this file just now." ++ - when aufs gets EACCES from branch fs's ->open(), then confirm the ++ registered TID and re-try open() with superuser's credential. ++ ++Pros and cons for each approach. ++ ++A. ++ - straightforward but highly depends upon VFS internal. ++ - the atomic behavaiour is kept. ++ - some of parameters such as nameidata are hard to reproduce for ++ branch fs. ++ - large overhead. ++B. ++ - easy to implement. ++ - the atomic behavaiour is lost. ++C. ++ - the atomic behavaiour is kept. ++ - dirty and tricky. ++ - VFS checks whether the file is created correctly after calling ++ ->create(), which means this approach doesn't work. ++D. ++ - easy to implement. ++ - the atomic behavaiour is lost. ++ - to open a file with superuser's credential and give it to a user ++ process is a bad idea, since the file object keeps the credential ++ in it. It may affect LSM or something. This approach doesn't work ++ either. ++ ++The approach A is ideal, but it hard to implement. So here is a ++variation of A, which is to be implemented. ++ ++A-1. Introduce aufs_atomic_open() ++ - calls branch fs ->atomic_open() if exists. otherwise calls ++ vfs_create() and finish_open(). ++ - the demerit is that the several checks after branch fs ++ ->atomic_open() are lost. in the ordinary case, the checks are ++ done by VFS:do_last(), lookup_open() and atomic_open(). some can ++ be implemented in aufs, but not all I am afraid. +diff --git a/Documentation/filesystems/aufs/design/03lookup.txt b/Documentation/filesystems/aufs/design/03lookup.txt +new file mode 100644 +index 0000000..a75b663 +--- /dev/null ++++ b/Documentation/filesystems/aufs/design/03lookup.txt +@@ -0,0 +1,113 @@ ++ ++# Copyright (C) 2005-2016 Junjiro R. Okajima ++# ++# This program is free software; you can redistribute it and/or modify ++# it under the terms of the GNU General Public License as published by ++# the Free Software Foundation; either version 2 of the License, or ++# (at your option) any later version. ++# ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# GNU General Public License for more details. ++# ++# You should have received a copy of the GNU General Public License ++# along with this program. If not, see . ++ ++Lookup in a Branch ++---------------------------------------------------------------------- ++Since aufs has a character of sub-VFS (see Introduction), it operates ++lookup for branches as VFS does. It may be a heavy work. But almost all ++lookup operation in aufs is the simplest case, ie. lookup only an entry ++directly connected to its parent. Digging down the directory hierarchy ++is unnecessary. VFS has a function lookup_one_len() for that use, and ++aufs calls it. ++ ++When a branch is a remote filesystem, aufs basically relies upon its ++->d_revalidate(), also aufs forces the hardest revalidate tests for ++them. ++For d_revalidate, aufs implements three levels of revalidate tests. See ++"Revalidate Dentry and UDBA" in detail. ++ ++ ++Test Only the Highest One for the Directory Permission (dirperm1 option) ++---------------------------------------------------------------------- ++Let's try case study. ++- aufs has two branches, upper readwrite and lower readonly. ++ /au = /rw + /ro ++- "dirA" exists under /ro, but /rw. and its mode is 0700. ++- user invoked "chmod a+rx /au/dirA" ++- the internal copy-up is activated and "/rw/dirA" is created and its ++ permission bits are set to world readable. ++- then "/au/dirA" becomes world readable? ++ ++In this case, /ro/dirA is still 0700 since it exists in readonly branch, ++or it may be a natively readonly filesystem. If aufs respects the lower ++branch, it should not respond readdir request from other users. But user ++allowed it by chmod. Should really aufs rejects showing the entries ++under /ro/dirA? ++ ++To be honest, I don't have a good solution for this case. So aufs ++implements 'dirperm1' and 'nodirperm1' mount options, and leave it to ++users. ++When dirperm1 is specified, aufs checks only the highest one for the ++directory permission, and shows the entries. Otherwise, as usual, checks ++every dir existing on all branches and rejects the request. ++ ++As a side effect, dirperm1 option improves the performance of aufs ++because the number of permission check is reduced when the number of ++branch is many. ++ ++ ++Revalidate Dentry and UDBA (User's Direct Branch Access) ++---------------------------------------------------------------------- ++Generally VFS helpers re-validate a dentry as a part of lookup. ++0. digging down the directory hierarchy. ++1. lock the parent dir by its i_mutex. ++2. lookup the final (child) entry. ++3. revalidate it. ++4. call the actual operation (create, unlink, etc.) ++5. unlock the parent dir ++ ++If the filesystem implements its ->d_revalidate() (step 3), then it is ++called. Actually aufs implements it and checks the dentry on a branch is ++still valid. ++But it is not enough. Because aufs has to release the lock for the ++parent dir on a branch at the end of ->lookup() (step 2) and ++->d_revalidate() (step 3) while the i_mutex of the aufs dir is still ++held by VFS. ++If the file on a branch is changed directly, eg. bypassing aufs, after ++aufs released the lock, then the subsequent operation may cause ++something unpleasant result. ++ ++This situation is a result of VFS architecture, ->lookup() and ++->d_revalidate() is separated. But I never say it is wrong. It is a good ++design from VFS's point of view. It is just not suitable for sub-VFS ++character in aufs. ++ ++Aufs supports such case by three level of revalidation which is ++selectable by user. ++1. Simple Revalidate ++ Addition to the native flow in VFS's, confirm the child-parent ++ relationship on the branch just after locking the parent dir on the ++ branch in the "actual operation" (step 4). When this validation ++ fails, aufs returns EBUSY. ->d_revalidate() (step 3) in aufs still ++ checks the validation of the dentry on branches. ++2. Monitor Changes Internally by Inotify/Fsnotify ++ Addition to above, in the "actual operation" (step 4) aufs re-lookup ++ the dentry on the branch, and returns EBUSY if it finds different ++ dentry. ++ Additionally, aufs sets the inotify/fsnotify watch for every dir on branches ++ during it is in cache. When the event is notified, aufs registers a ++ function to kernel 'events' thread by schedule_work(). And the ++ function sets some special status to the cached aufs dentry and inode ++ private data. If they are not cached, then aufs has nothing to ++ do. When the same file is accessed through aufs (step 0-3) later, ++ aufs will detect the status and refresh all necessary data. ++ In this mode, aufs has to ignore the event which is fired by aufs ++ itself. ++3. No Extra Validation ++ This is the simplest test and doesn't add any additional revalidation ++ test, and skip the revalidation in step 4. It is useful and improves ++ aufs performance when system surely hide the aufs branches from user, ++ by over-mounting something (or another method). +diff --git a/Documentation/filesystems/aufs/design/04branch.txt b/Documentation/filesystems/aufs/design/04branch.txt +new file mode 100644 +index 0000000..6203ac4 +--- /dev/null ++++ b/Documentation/filesystems/aufs/design/04branch.txt +@@ -0,0 +1,74 @@ ++ ++# Copyright (C) 2005-2016 Junjiro R. Okajima ++# ++# This program is free software; you can redistribute it and/or modify ++# it under the terms of the GNU General Public License as published by ++# the Free Software Foundation; either version 2 of the License, or ++# (at your option) any later version. ++# ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# GNU General Public License for more details. ++# ++# You should have received a copy of the GNU General Public License ++# along with this program. If not, see . ++ ++Branch Manipulation ++ ++Since aufs supports dynamic branch manipulation, ie. add/remove a branch ++and changing its permission/attribute, there are a lot of works to do. ++ ++ ++Add a Branch ++---------------------------------------------------------------------- ++o Confirm the adding dir exists outside of aufs, including loopback ++ mount, and its various attributes. ++o Initialize the xino file and whiteout bases if necessary. ++ See struct.txt. ++ ++o Check the owner/group/mode of the directory ++ When the owner/group/mode of the adding directory differs from the ++ existing branch, aufs issues a warning because it may impose a ++ security risk. ++ For example, when a upper writable branch has a world writable empty ++ top directory, a malicious user can create any files on the writable ++ branch directly, like copy-up and modify manually. If something like ++ /etc/{passwd,shadow} exists on the lower readonly branch but the upper ++ writable branch, and the writable branch is world-writable, then a ++ malicious guy may create /etc/passwd on the writable branch directly ++ and the infected file will be valid in aufs. ++ I am afraid it can be a security issue, but aufs can do nothing except ++ producing a warning. ++ ++ ++Delete a Branch ++---------------------------------------------------------------------- ++o Confirm the deleting branch is not busy ++ To be general, there is one merit to adopt "remount" interface to ++ manipulate branches. It is to discard caches. At deleting a branch, ++ aufs checks the still cached (and connected) dentries and inodes. If ++ there are any, then they are all in-use. An inode without its ++ corresponding dentry can be alive alone (for example, inotify/fsnotify case). ++ ++ For the cached one, aufs checks whether the same named entry exists on ++ other branches. ++ If the cached one is a directory, because aufs provides a merged view ++ to users, as long as one dir is left on any branch aufs can show the ++ dir to users. In this case, the branch can be removed from aufs. ++ Otherwise aufs rejects deleting the branch. ++ ++ If any file on the deleting branch is opened by aufs, then aufs ++ rejects deleting. ++ ++ ++Modify the Permission of a Branch ++---------------------------------------------------------------------- ++o Re-initialize or remove the xino file and whiteout bases if necessary. ++ See struct.txt. ++ ++o rw --> ro: Confirm the modifying branch is not busy ++ Aufs rejects the request if any of these conditions are true. ++ - a file on the branch is mmap-ed. ++ - a regular file on the branch is opened for write and there is no ++ same named entry on the upper branch. +diff --git a/Documentation/filesystems/aufs/design/05wbr_policy.txt b/Documentation/filesystems/aufs/design/05wbr_policy.txt +new file mode 100644 +index 0000000..539cc14 +--- /dev/null ++++ b/Documentation/filesystems/aufs/design/05wbr_policy.txt +@@ -0,0 +1,64 @@ ++ ++# Copyright (C) 2005-2016 Junjiro R. Okajima ++# ++# This program is free software; you can redistribute it and/or modify ++# it under the terms of the GNU General Public License as published by ++# the Free Software Foundation; either version 2 of the License, or ++# (at your option) any later version. ++# ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# GNU General Public License for more details. ++# ++# You should have received a copy of the GNU General Public License ++# along with this program. If not, see . ++ ++Policies to Select One among Multiple Writable Branches ++---------------------------------------------------------------------- ++When the number of writable branch is more than one, aufs has to decide ++the target branch for file creation or copy-up. By default, the highest ++writable branch which has the parent (or ancestor) dir of the target ++file is chosen (top-down-parent policy). ++By user's request, aufs implements some other policies to select the ++writable branch, for file creation several policies, round-robin, ++most-free-space, and other policies. For copy-up, top-down-parent, ++bottom-up-parent, bottom-up and others. ++ ++As expected, the round-robin policy selects the branch in circular. When ++you have two writable branches and creates 10 new files, 5 files will be ++created for each branch. mkdir(2) systemcall is an exception. When you ++create 10 new directories, all will be created on the same branch. ++And the most-free-space policy selects the one which has most free ++space among the writable branches. The amount of free space will be ++checked by aufs internally, and users can specify its time interval. ++ ++The policies for copy-up is more simple, ++top-down-parent is equivalent to the same named on in create policy, ++bottom-up-parent selects the writable branch where the parent dir ++exists and the nearest upper one from the copyup-source, ++bottom-up selects the nearest upper writable branch from the ++copyup-source, regardless the existence of the parent dir. ++ ++There are some rules or exceptions to apply these policies. ++- If there is a readonly branch above the policy-selected branch and ++ the parent dir is marked as opaque (a variation of whiteout), or the ++ target (creating) file is whiteout-ed on the upper readonly branch, ++ then the result of the policy is ignored and the target file will be ++ created on the nearest upper writable branch than the readonly branch. ++- If there is a writable branch above the policy-selected branch and ++ the parent dir is marked as opaque or the target file is whiteouted ++ on the branch, then the result of the policy is ignored and the target ++ file will be created on the highest one among the upper writable ++ branches who has diropq or whiteout. In case of whiteout, aufs removes ++ it as usual. ++- link(2) and rename(2) systemcalls are exceptions in every policy. ++ They try selecting the branch where the source exists as possible ++ since copyup a large file will take long time. If it can't be, ++ ie. the branch where the source exists is readonly, then they will ++ follow the copyup policy. ++- There is an exception for rename(2) when the target exists. ++ If the rename target exists, aufs compares the index of the branches ++ where the source and the target exists and selects the higher ++ one. If the selected branch is readonly, then aufs follows the ++ copyup policy. +diff --git a/Documentation/filesystems/aufs/design/06fhsm.txt b/Documentation/filesystems/aufs/design/06fhsm.txt +new file mode 100644 +index 0000000..aee7779 +--- /dev/null ++++ b/Documentation/filesystems/aufs/design/06fhsm.txt +@@ -0,0 +1,120 @@ ++ ++# Copyright (C) 2011-2016 Junjiro R. Okajima ++# ++# This program is free software; you can redistribute it and/or modify ++# it under the terms of the GNU General Public License as published by ++# the Free Software Foundation; either version 2 of the License, or ++# (at your option) any later version. ++# ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# GNU General Public License for more details. ++# ++# You should have received a copy of the GNU General Public License ++# along with this program; if not, write to the Free Software ++# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ ++ ++File-based Hierarchical Storage Management (FHSM) ++---------------------------------------------------------------------- ++Hierarchical Storage Management (or HSM) is a well-known feature in the ++storage world. Aufs provides this feature as file-based with multiple ++writable branches, based upon the principle of "Colder, the Lower". ++Here the word "colder" means that the less used files, and "lower" means ++that the position in the order of the stacked branches vertically. ++These multiple writable branches are prioritized, ie. the topmost one ++should be the fastest drive and be used heavily. ++ ++o Characters in aufs FHSM story ++- aufs itself and a new branch attribute. ++- a new ioctl interface to move-down and to establish a connection with ++ the daemon ("move-down" is a converse of "copy-up"). ++- userspace tool and daemon. ++ ++The userspace daemon establishes a connection with aufs and waits for ++the notification. The notified information is very similar to struct ++statfs containing the number of consumed blocks and inodes. ++When the consumed blocks/inodes of a branch exceeds the user-specified ++upper watermark, the daemon activates its move-down process until the ++consumed blocks/inodes reaches the user-specified lower watermark. ++ ++The actual move-down is done by aufs based upon the request from ++user-space since we need to maintain the inode number and the internal ++pointer arrays in aufs. ++ ++Currently aufs FHSM handles the regular files only. Additionally they ++must not be hard-linked nor pseudo-linked. ++ ++ ++o Cowork of aufs and the user-space daemon ++ During the userspace daemon established the connection, aufs sends a ++ small notification to it whenever aufs writes something into the ++ writable branch. But it may cost high since aufs issues statfs(2) ++ internally. So user can specify a new option to cache the ++ info. Actually the notification is controlled by these factors. ++ + the specified cache time. ++ + classified as "force" by aufs internally. ++ Until the specified time expires, aufs doesn't send the info ++ except the forced cases. When aufs decide forcing, the info is always ++ notified to userspace. ++ For example, the number of free inodes is generally large enough and ++ the shortage of it happens rarely. So aufs doesn't force the ++ notification when creating a new file, directory and others. This is ++ the typical case which aufs doesn't force. ++ When aufs writes the actual filedata and the files consumes any of new ++ blocks, the aufs forces notifying. ++ ++ ++o Interfaces in aufs ++- New branch attribute. ++ + fhsm ++ Specifies that the branch is managed by FHSM feature. In other word, ++ participant in the FHSM. ++ When nofhsm is set to the branch, it will not be the source/target ++ branch of the move-down operation. This attribute is set ++ independently from coo and moo attributes, and if you want full ++ FHSM, you should specify them as well. ++- New mount option. ++ + fhsm_sec ++ Specifies a second to suppress many less important info to be ++ notified. ++- New ioctl. ++ + AUFS_CTL_FHSM_FD ++ create a new file descriptor which userspace can read the notification ++ (a subset of struct statfs) from aufs. ++- Module parameter 'brs' ++ It has to be set to 1. Otherwise the new mount option 'fhsm' will not ++ be set. ++- mount helpers /sbin/mount.aufs and /sbin/umount.aufs ++ When there are two or more branches with fhsm attributes, ++ /sbin/mount.aufs invokes the user-space daemon and /sbin/umount.aufs ++ terminates it. As a result of remounting and branch-manipulation, the ++ number of branches with fhsm attribute can be one. In this case, ++ /sbin/mount.aufs will terminate the user-space daemon. ++ ++ ++Finally the operation is done as these steps in kernel-space. ++- make sure that, ++ + no one else is using the file. ++ + the file is not hard-linked. ++ + the file is not pseudo-linked. ++ + the file is a regular file. ++ + the parent dir is not opaqued. ++- find the target writable branch. ++- make sure the file is not whiteout-ed by the upper (than the target) ++ branch. ++- make the parent dir on the target branch. ++- mutex lock the inode on the branch. ++- unlink the whiteout on the target branch (if exists). ++- lookup and create the whiteout-ed temporary name on the target branch. ++- copy the file as the whiteout-ed temporary name on the target branch. ++- rename the whiteout-ed temporary name to the original name. ++- unlink the file on the source branch. ++- maintain the internal pointer array and the external inode number ++ table (XINO). ++- maintain the timestamps and other attributes of the parent dir and the ++ file. ++ ++And of course, in every step, an error may happen. So the operation ++should restore the original file state after an error happens. +diff --git a/Documentation/filesystems/aufs/design/06mmap.txt b/Documentation/filesystems/aufs/design/06mmap.txt +new file mode 100644 +index 0000000..676f3af +--- /dev/null ++++ b/Documentation/filesystems/aufs/design/06mmap.txt +@@ -0,0 +1,72 @@ ++ ++# Copyright (C) 2005-2016 Junjiro R. Okajima ++# ++# This program is free software; you can redistribute it and/or modify ++# it under the terms of the GNU General Public License as published by ++# the Free Software Foundation; either version 2 of the License, or ++# (at your option) any later version. ++# ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# GNU General Public License for more details. ++# ++# You should have received a copy of the GNU General Public License ++# along with this program. If not, see . ++ ++mmap(2) -- File Memory Mapping ++---------------------------------------------------------------------- ++In aufs, the file-mapped pages are handled by a branch fs directly, no ++interaction with aufs. It means aufs_mmap() calls the branch fs's ++->mmap(). ++This approach is simple and good, but there is one problem. ++Under /proc, several entries show the mmapped files by its path (with ++device and inode number), and the printed path will be the path on the ++branch fs's instead of virtual aufs's. ++This is not a problem in most cases, but some utilities lsof(1) (and its ++user) may expect the path on aufs. ++ ++To address this issue, aufs adds a new member called vm_prfile in struct ++vm_area_struct (and struct vm_region). The original vm_file points to ++the file on the branch fs in order to handle everything correctly as ++usual. The new vm_prfile points to a virtual file in aufs, and the ++show-functions in procfs refers to vm_prfile if it is set. ++Also we need to maintain several other places where touching vm_file ++such like ++- fork()/clone() copies vma and the reference count of vm_file is ++ incremented. ++- merging vma maintains the ref count too. ++ ++This is not a good approach. It just fakes the printed path. But it ++leaves all behaviour around f_mapping unchanged. This is surely an ++advantage. ++Actually aufs had adopted another complicated approach which calls ++generic_file_mmap() and handles struct vm_operations_struct. In this ++approach, aufs met a hard problem and I could not solve it without ++switching the approach. ++ ++There may be one more another approach which is ++- bind-mount the branch-root onto the aufs-root internally ++- grab the new vfsmount (ie. struct mount) ++- lazy-umount the branch-root internally ++- in open(2) the aufs-file, open the branch-file with the hidden ++ vfsmount (instead of the original branch's vfsmount) ++- ideally this "bind-mount and lazy-umount" should be done atomically, ++ but it may be possible from userspace by the mount helper. ++ ++Adding the internal hidden vfsmount and using it in opening a file, the ++file path under /proc will be printed correctly. This approach looks ++smarter, but is not possible I am afraid. ++- aufs-root may be bind-mount later. when it happens, another hidden ++ vfsmount will be required. ++- it is hard to get the chance to bind-mount and lazy-umount ++ + in kernel-space, FS can have vfsmount in open(2) via ++ file->f_path, and aufs can know its vfsmount. But several locks are ++ already acquired, and if aufs tries to bind-mount and lazy-umount ++ here, then it may cause a deadlock. ++ + in user-space, bind-mount doesn't invoke the mount helper. ++- since /proc shows dev and ino, aufs has to give vma these info. it ++ means a new member vm_prinode will be necessary. this is essentially ++ equivalent to vm_prfile described above. ++ ++I have to give up this "looks-smater" approach. +diff --git a/Documentation/filesystems/aufs/design/06xattr.txt b/Documentation/filesystems/aufs/design/06xattr.txt +new file mode 100644 +index 0000000..4242fd8 +--- /dev/null ++++ b/Documentation/filesystems/aufs/design/06xattr.txt +@@ -0,0 +1,96 @@ ++ ++# Copyright (C) 2014-2016 Junjiro R. Okajima ++# ++# This program is free software; you can redistribute it and/or modify ++# it under the terms of the GNU General Public License as published by ++# the Free Software Foundation; either version 2 of the License, or ++# (at your option) any later version. ++# ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# GNU General Public License for more details. ++# ++# You should have received a copy of the GNU General Public License ++# along with this program; if not, write to the Free Software ++# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ ++ ++Listing XATTR/EA and getting the value ++---------------------------------------------------------------------- ++For the inode standard attributes (owner, group, timestamps, etc.), aufs ++shows the values from the topmost existing file. This behaviour is good ++for the non-dir entries since the bahaviour exactly matches the shown ++information. But for the directories, aufs considers all the same named ++entries on the lower branches. Which means, if one of the lower entry ++rejects readdir call, then aufs returns an error even if the topmost ++entry allows it. This behaviour is necessary to respect the branch fs's ++security, but can make users confused since the user-visible standard ++attributes don't match the behaviour. ++To address this issue, aufs has a mount option called dirperm1 which ++checks the permission for the topmost entry only, and ignores the lower ++entry's permission. ++ ++A similar issue can happen around XATTR. ++getxattr(2) and listxattr(2) families behave as if dirperm1 option is ++always set. Otherwise these very unpleasant situation would happen. ++- listxattr(2) may return the duplicated entries. ++- users may not be able to remove or reset the XATTR forever, ++ ++ ++XATTR/EA support in the internal (copy,move)-(up,down) ++---------------------------------------------------------------------- ++Generally the extended attributes of inode are categorized as these. ++- "security" for LSM and capability. ++- "system" for posix ACL, 'acl' mount option is required for the branch ++ fs generally. ++- "trusted" for userspace, CAP_SYS_ADMIN is required. ++- "user" for userspace, 'user_xattr' mount option is required for the ++ branch fs generally. ++ ++Moreover there are some other categories. Aufs handles these rather ++unpopular categories as the ordinary ones, ie. there is no special ++condition nor exception. ++ ++In copy-up, the support for XATTR on the dst branch may differ from the ++src branch. In this case, the copy-up operation will get an error and ++the original user operation which triggered the copy-up will fail. It ++can happen that even all copy-up will fail. ++When both of src and dst branches support XATTR and if an error occurs ++during copying XATTR, then the copy-up should fail obviously. That is a ++good reason and aufs should return an error to userspace. But when only ++the src branch support that XATTR, aufs should not return an error. ++For example, the src branch supports ACL but the dst branch doesn't ++because the dst branch may natively un-support it or temporary ++un-support it due to "noacl" mount option. Of course, the dst branch fs ++may NOT return an error even if the XATTR is not supported. It is ++totally up to the branch fs. ++ ++Anyway when the aufs internal copy-up gets an error from the dst branch ++fs, then aufs tries removing the just copied entry and returns the error ++to the userspace. The worst case of this situation will be all copy-up ++will fail. ++ ++For the copy-up operation, there two basic approaches. ++- copy the specified XATTR only (by category above), and return the ++ error unconditionally if it happens. ++- copy all XATTR, and ignore the error on the specified category only. ++ ++In order to support XATTR and to implement the correct behaviour, aufs ++chooses the latter approach and introduces some new branch attributes, ++"icexsec", "icexsys", "icextr", "icexusr", and "icexoth". ++They correspond to the XATTR namespaces (see above). Additionally, to be ++convenient, "icex" is also provided which means all "icex*" attributes ++are set (here the word "icex" stands for "ignore copy-error on XATTR"). ++ ++The meaning of these attributes is to ignore the error from setting ++XATTR on that branch. ++Note that aufs tries copying all XATTR unconditionally, and ignores the ++error from the dst branch according to the specified attributes. ++ ++Some XATTR may have its default value. The default value may come from ++the parent dir or the environment. If the default value is set at the ++file creating-time, it will be overwritten by copy-up. ++Some contradiction may happen I am afraid. ++Do we need another attribute to stop copying XATTR? I am unsure. For ++now, aufs implements the branch attributes to ignore the error. +diff --git a/Documentation/filesystems/aufs/design/07export.txt b/Documentation/filesystems/aufs/design/07export.txt +new file mode 100644 +index 0000000..9a4f31f +--- /dev/null ++++ b/Documentation/filesystems/aufs/design/07export.txt +@@ -0,0 +1,58 @@ ++ ++# Copyright (C) 2005-2016 Junjiro R. Okajima ++# ++# This program is free software; you can redistribute it and/or modify ++# it under the terms of the GNU General Public License as published by ++# the Free Software Foundation; either version 2 of the License, or ++# (at your option) any later version. ++# ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# GNU General Public License for more details. ++# ++# You should have received a copy of the GNU General Public License ++# along with this program. If not, see . ++ ++Export Aufs via NFS ++---------------------------------------------------------------------- ++Here is an approach. ++- like xino/xib, add a new file 'xigen' which stores aufs inode ++ generation. ++- iget_locked(): initialize aufs inode generation for a new inode, and ++ store it in xigen file. ++- destroy_inode(): increment aufs inode generation and store it in xigen ++ file. it is necessary even if it is not unlinked, because any data of ++ inode may be changed by UDBA. ++- encode_fh(): for a root dir, simply return FILEID_ROOT. otherwise ++ build file handle by ++ + branch id (4 bytes) ++ + superblock generation (4 bytes) ++ + inode number (4 or 8 bytes) ++ + parent dir inode number (4 or 8 bytes) ++ + inode generation (4 bytes)) ++ + return value of exportfs_encode_fh() for the parent on a branch (4 ++ bytes) ++ + file handle for a branch (by exportfs_encode_fh()) ++- fh_to_dentry(): ++ + find the index of a branch from its id in handle, and check it is ++ still exist in aufs. ++ + 1st level: get the inode number from handle and search it in cache. ++ + 2nd level: if not found in cache, get the parent inode number from ++ the handle and search it in cache. and then open the found parent ++ dir, find the matching inode number by vfs_readdir() and get its ++ name, and call lookup_one_len() for the target dentry. ++ + 3rd level: if the parent dir is not cached, call ++ exportfs_decode_fh() for a branch and get the parent on a branch, ++ build a pathname of it, convert it a pathname in aufs, call ++ path_lookup(). now aufs gets a parent dir dentry, then handle it as ++ the 2nd level. ++ + to open the dir, aufs needs struct vfsmount. aufs keeps vfsmount ++ for every branch, but not itself. to get this, (currently) aufs ++ searches in current->nsproxy->mnt_ns list. it may not be a good ++ idea, but I didn't get other approach. ++ + test the generation of the gotten inode. ++- every inode operation: they may get EBUSY due to UDBA. in this case, ++ convert it into ESTALE for NFSD. ++- readdir(): call lockdep_on/off() because filldir in NFSD calls ++ lookup_one_len(), vfs_getattr(), encode_fh() and others. +diff --git a/Documentation/filesystems/aufs/design/08shwh.txt b/Documentation/filesystems/aufs/design/08shwh.txt +new file mode 100644 +index 0000000..0a22906 +--- /dev/null ++++ b/Documentation/filesystems/aufs/design/08shwh.txt +@@ -0,0 +1,52 @@ ++ ++# Copyright (C) 2005-2016 Junjiro R. Okajima ++# ++# This program is free software; you can redistribute it and/or modify ++# it under the terms of the GNU General Public License as published by ++# the Free Software Foundation; either version 2 of the License, or ++# (at your option) any later version. ++# ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# GNU General Public License for more details. ++# ++# You should have received a copy of the GNU General Public License ++# along with this program. If not, see . ++ ++Show Whiteout Mode (shwh) ++---------------------------------------------------------------------- ++Generally aufs hides the name of whiteouts. But in some cases, to show ++them is very useful for users. For instance, creating a new middle layer ++(branch) by merging existing layers. ++ ++(borrowing aufs1 HOW-TO from a user, Michael Towers) ++When you have three branches, ++- Bottom: 'system', squashfs (underlying base system), read-only ++- Middle: 'mods', squashfs, read-only ++- Top: 'overlay', ram (tmpfs), read-write ++ ++The top layer is loaded at boot time and saved at shutdown, to preserve ++the changes made to the system during the session. ++When larger changes have been made, or smaller changes have accumulated, ++the size of the saved top layer data grows. At this point, it would be ++nice to be able to merge the two overlay branches ('mods' and 'overlay') ++and rewrite the 'mods' squashfs, clearing the top layer and thus ++restoring save and load speed. ++ ++This merging is simplified by the use of another aufs mount, of just the ++two overlay branches using the 'shwh' option. ++# mount -t aufs -o ro,shwh,br:/livesys/overlay=ro+wh:/livesys/mods=rr+wh \ ++ aufs /livesys/merge_union ++ ++A merged view of these two branches is then available at ++/livesys/merge_union, and the new feature is that the whiteouts are ++visible! ++Note that in 'shwh' mode the aufs mount must be 'ro', which will disable ++writing to all branches. Also the default mode for all branches is 'ro'. ++It is now possible to save the combined contents of the two overlay ++branches to a new squashfs, e.g.: ++# mksquashfs /livesys/merge_union /path/to/newmods.squash ++ ++This new squashfs archive can be stored on the boot device and the ++initramfs will use it to replace the old one at the next boot. +diff --git a/Documentation/filesystems/aufs/design/10dynop.txt b/Documentation/filesystems/aufs/design/10dynop.txt +new file mode 100644 +index 0000000..4ab46ff +--- /dev/null ++++ b/Documentation/filesystems/aufs/design/10dynop.txt +@@ -0,0 +1,47 @@ ++ ++# Copyright (C) 2010-2016 Junjiro R. Okajima ++# ++# This program is free software; you can redistribute it and/or modify ++# it under the terms of the GNU General Public License as published by ++# the Free Software Foundation; either version 2 of the License, or ++# (at your option) any later version. ++# ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# GNU General Public License for more details. ++# ++# You should have received a copy of the GNU General Public License ++# along with this program. If not, see . ++ ++Dynamically customizable FS operations ++---------------------------------------------------------------------- ++Generally FS operations (struct inode_operations, struct ++address_space_operations, struct file_operations, etc.) are defined as ++"static const", but it never means that FS have only one set of ++operation. Some FS have multiple sets of them. For instance, ext2 has ++three sets, one for XIP, for NOBH, and for normal. ++Since aufs overrides and redirects these operations, sometimes aufs has ++to change its behaviour according to the branch FS type. More importantly ++VFS acts differently if a function (member in the struct) is set or ++not. It means aufs should have several sets of operations and select one ++among them according to the branch FS definition. ++ ++In order to solve this problem and not to affect the behaviour of VFS, ++aufs defines these operations dynamically. For instance, aufs defines ++dummy direct_IO function for struct address_space_operations, but it may ++not be set to the address_space_operations actually. When the branch FS ++doesn't have it, aufs doesn't set it to its address_space_operations ++while the function definition itself is still alive. So the behaviour ++itself will not change, and it will return an error when direct_IO is ++not set. ++ ++The lifetime of these dynamically generated operation object is ++maintained by aufs branch object. When the branch is removed from aufs, ++the reference counter of the object is decremented. When it reaches ++zero, the dynamically generated operation object will be freed. ++ ++This approach is designed to support AIO (io_submit), Direct I/O and ++XIP (DAX) mainly. ++Currently this approach is applied to address_space_operations for ++regular files only. +diff --git a/MAINTAINERS b/MAINTAINERS +index 9c567a4..a62aea4 100644 +--- a/MAINTAINERS ++++ b/MAINTAINERS +@@ -2128,6 +2128,19 @@ F: include/linux/audit.h + F: include/uapi/linux/audit.h + F: kernel/audit* + ++AUFS (advanced multi layered unification filesystem) FILESYSTEM ++M: "J. R. Okajima" ++L: linux-unionfs@vger.kernel.org ++L: aufs-users@lists.sourceforge.net (members only) ++W: http://aufs.sourceforge.net ++T: git://github.com/sfjro/aufs4-linux.git ++S: Supported ++F: Documentation/filesystems/aufs/ ++F: Documentation/ABI/testing/debugfs-aufs ++F: Documentation/ABI/testing/sysfs-aufs ++F: fs/aufs/ ++F: include/uapi/linux/aufs_type.h ++ + AUXILIARY DISPLAY DRIVERS + M: Miguel Ojeda Sandonis + W: http://miguelojeda.es/auxdisplay.htm +diff --git a/drivers/block/loop.c b/drivers/block/loop.c +index 80cf8ad..ba9e4a7 100644 +--- a/drivers/block/loop.c ++++ b/drivers/block/loop.c +@@ -712,6 +712,24 @@ static inline int is_loop_device(struct file *file) + return i && S_ISBLK(i->i_mode) && MAJOR(i->i_rdev) == LOOP_MAJOR; + } + ++/* ++ * for AUFS ++ * no get/put for file. ++ */ ++struct file *loop_backing_file(struct super_block *sb) ++{ ++ struct file *ret; ++ struct loop_device *l; ++ ++ ret = NULL; ++ if (MAJOR(sb->s_dev) == LOOP_MAJOR) { ++ l = sb->s_bdev->bd_disk->private_data; ++ ret = l->lo_backing_file; ++ } ++ return ret; ++} ++EXPORT_SYMBOL_GPL(loop_backing_file); ++ + /* loop sysfs attributes */ + + static ssize_t loop_attr_show(struct device *dev, char *page, +diff --git a/fs/Kconfig b/fs/Kconfig +index 6725f59..2ced198 100644 +--- a/fs/Kconfig ++++ b/fs/Kconfig +@@ -235,6 +235,7 @@ source "fs/pstore/Kconfig" + source "fs/sysv/Kconfig" + source "fs/ufs/Kconfig" + source "fs/exofs/Kconfig" ++source "fs/aufs/Kconfig" + + endif # MISC_FILESYSTEMS + +diff --git a/fs/Makefile b/fs/Makefile +index 85b6e13..e7bb164 100644 +--- a/fs/Makefile ++++ b/fs/Makefile +@@ -128,3 +128,4 @@ obj-y += exofs/ # Multiple modules + obj-$(CONFIG_CEPH_FS) += ceph/ + obj-$(CONFIG_PSTORE) += pstore/ + obj-$(CONFIG_EFIVAR_FS) += efivarfs/ ++obj-$(CONFIG_AUFS_FS) += aufs/ +diff --git a/fs/aufs/Kconfig b/fs/aufs/Kconfig +new file mode 100644 +index 0000000..63560ce +--- /dev/null ++++ b/fs/aufs/Kconfig +@@ -0,0 +1,185 @@ ++config AUFS_FS ++ tristate "Aufs (Advanced multi layered unification filesystem) support" ++ help ++ Aufs is a stackable unification filesystem such as Unionfs, ++ which unifies several directories and provides a merged single ++ directory. ++ In the early days, aufs was entirely re-designed and ++ re-implemented Unionfs Version 1.x series. Introducing many ++ original ideas, approaches and improvements, it becomes totally ++ different from Unionfs while keeping the basic features. ++ ++if AUFS_FS ++choice ++ prompt "Maximum number of branches" ++ default AUFS_BRANCH_MAX_127 ++ help ++ Specifies the maximum number of branches (or member directories) ++ in a single aufs. The larger value consumes more system ++ resources and has a minor impact to performance. ++config AUFS_BRANCH_MAX_127 ++ bool "127" ++ help ++ Specifies the maximum number of branches (or member directories) ++ in a single aufs. The larger value consumes more system ++ resources and has a minor impact to performance. ++config AUFS_BRANCH_MAX_511 ++ bool "511" ++ help ++ Specifies the maximum number of branches (or member directories) ++ in a single aufs. The larger value consumes more system ++ resources and has a minor impact to performance. ++config AUFS_BRANCH_MAX_1023 ++ bool "1023" ++ help ++ Specifies the maximum number of branches (or member directories) ++ in a single aufs. The larger value consumes more system ++ resources and has a minor impact to performance. ++config AUFS_BRANCH_MAX_32767 ++ bool "32767" ++ help ++ Specifies the maximum number of branches (or member directories) ++ in a single aufs. The larger value consumes more system ++ resources and has a minor impact to performance. ++endchoice ++ ++config AUFS_SBILIST ++ bool ++ depends on AUFS_MAGIC_SYSRQ || PROC_FS ++ default y ++ help ++ Automatic configuration for internal use. ++ When aufs supports Magic SysRq or /proc, enabled automatically. ++ ++config AUFS_HNOTIFY ++ bool "Detect direct branch access (bypassing aufs)" ++ help ++ If you want to modify files on branches directly, eg. bypassing aufs, ++ and want aufs to detect the changes of them fully, then enable this ++ option and use 'udba=notify' mount option. ++ Currently there is only one available configuration, "fsnotify". ++ It will have a negative impact to the performance. ++ See detail in aufs.5. ++ ++choice ++ prompt "method" if AUFS_HNOTIFY ++ default AUFS_HFSNOTIFY ++config AUFS_HFSNOTIFY ++ bool "fsnotify" ++ select FSNOTIFY ++endchoice ++ ++config AUFS_EXPORT ++ bool "NFS-exportable aufs" ++ depends on EXPORTFS ++ help ++ If you want to export your mounted aufs via NFS, then enable this ++ option. There are several requirements for this configuration. ++ See detail in aufs.5. ++ ++config AUFS_INO_T_64 ++ bool ++ depends on AUFS_EXPORT ++ depends on 64BIT && !(ALPHA || S390) ++ default y ++ help ++ Automatic configuration for internal use. ++ /* typedef unsigned long/int __kernel_ino_t */ ++ /* alpha and s390x are int */ ++ ++config AUFS_XATTR ++ bool "support for XATTR/EA (including Security Labels)" ++ help ++ If your branch fs supports XATTR/EA and you want to make them ++ available in aufs too, then enable this opsion and specify the ++ branch attributes for EA. ++ See detail in aufs.5. ++ ++config AUFS_FHSM ++ bool "File-based Hierarchical Storage Management" ++ help ++ Hierarchical Storage Management (or HSM) is a well-known feature ++ in the storage world. Aufs provides this feature as file-based. ++ with multiple branches. ++ These multiple branches are prioritized, ie. the topmost one ++ should be the fastest drive and be used heavily. ++ ++config AUFS_RDU ++ bool "Readdir in userspace" ++ help ++ Aufs has two methods to provide a merged view for a directory, ++ by a user-space library and by kernel-space natively. The latter ++ is always enabled but sometimes large and slow. ++ If you enable this option, install the library in aufs2-util ++ package, and set some environment variables for your readdir(3), ++ then the work will be handled in user-space which generally ++ shows better performance in most cases. ++ See detail in aufs.5. ++ ++config AUFS_SHWH ++ bool "Show whiteouts" ++ help ++ If you want to make the whiteouts in aufs visible, then enable ++ this option and specify 'shwh' mount option. Although it may ++ sounds like philosophy or something, but in technically it ++ simply shows the name of whiteout with keeping its behaviour. ++ ++config AUFS_BR_RAMFS ++ bool "Ramfs (initramfs/rootfs) as an aufs branch" ++ help ++ If you want to use ramfs as an aufs branch fs, then enable this ++ option. Generally tmpfs is recommended. ++ Aufs prohibited them to be a branch fs by default, because ++ initramfs becomes unusable after switch_root or something ++ generally. If you sets initramfs as an aufs branch and boot your ++ system by switch_root, you will meet a problem easily since the ++ files in initramfs may be inaccessible. ++ Unless you are going to use ramfs as an aufs branch fs without ++ switch_root or something, leave it N. ++ ++config AUFS_BR_FUSE ++ bool "Fuse fs as an aufs branch" ++ depends on FUSE_FS ++ select AUFS_POLL ++ help ++ If you want to use fuse-based userspace filesystem as an aufs ++ branch fs, then enable this option. ++ It implements the internal poll(2) operation which is ++ implemented by fuse only (curretnly). ++ ++config AUFS_POLL ++ bool ++ help ++ Automatic configuration for internal use. ++ ++config AUFS_BR_HFSPLUS ++ bool "Hfsplus as an aufs branch" ++ depends on HFSPLUS_FS ++ default y ++ help ++ If you want to use hfsplus fs as an aufs branch fs, then enable ++ this option. This option introduces a small overhead at ++ copying-up a file on hfsplus. ++ ++config AUFS_BDEV_LOOP ++ bool ++ depends on BLK_DEV_LOOP ++ default y ++ help ++ Automatic configuration for internal use. ++ Convert =[ym] into =y. ++ ++config AUFS_DEBUG ++ bool "Debug aufs" ++ help ++ Enable this to compile aufs internal debug code. ++ It will have a negative impact to the performance. ++ ++config AUFS_MAGIC_SYSRQ ++ bool ++ depends on AUFS_DEBUG && MAGIC_SYSRQ ++ default y ++ help ++ Automatic configuration for internal use. ++ When aufs supports Magic SysRq, enabled automatically. ++endif +diff --git a/fs/aufs/Makefile b/fs/aufs/Makefile +new file mode 100644 +index 0000000..c7a501e +--- /dev/null ++++ b/fs/aufs/Makefile +@@ -0,0 +1,44 @@ ++ ++include ${src}/magic.mk ++ifeq (${CONFIG_AUFS_FS},m) ++include ${src}/conf.mk ++endif ++-include ${src}/priv_def.mk ++ ++# cf. include/linux/kernel.h ++# enable pr_debug ++ccflags-y += -DDEBUG ++# sparse requires the full pathname ++ifdef M ++ccflags-y += -include ${M}/../../include/uapi/linux/aufs_type.h ++else ++ccflags-y += -include ${srctree}/include/uapi/linux/aufs_type.h ++endif ++ ++obj-$(CONFIG_AUFS_FS) += aufs.o ++aufs-y := module.o sbinfo.o super.o branch.o xino.o sysaufs.o opts.o \ ++ wkq.o vfsub.o dcsub.o \ ++ cpup.o whout.o wbr_policy.o \ ++ dinfo.o dentry.o \ ++ dynop.o \ ++ finfo.o file.o f_op.o \ ++ dir.o vdir.o \ ++ iinfo.o inode.o i_op.o i_op_add.o i_op_del.o i_op_ren.o \ ++ mvdown.o ioctl.o ++ ++# all are boolean ++aufs-$(CONFIG_PROC_FS) += procfs.o plink.o ++aufs-$(CONFIG_SYSFS) += sysfs.o ++aufs-$(CONFIG_DEBUG_FS) += dbgaufs.o ++aufs-$(CONFIG_AUFS_BDEV_LOOP) += loop.o ++aufs-$(CONFIG_AUFS_HNOTIFY) += hnotify.o ++aufs-$(CONFIG_AUFS_HFSNOTIFY) += hfsnotify.o ++aufs-$(CONFIG_AUFS_EXPORT) += export.o ++aufs-$(CONFIG_AUFS_XATTR) += xattr.o ++aufs-$(CONFIG_FS_POSIX_ACL) += posix_acl.o ++aufs-$(CONFIG_AUFS_FHSM) += fhsm.o ++aufs-$(CONFIG_AUFS_POLL) += poll.o ++aufs-$(CONFIG_AUFS_RDU) += rdu.o ++aufs-$(CONFIG_AUFS_BR_HFSPLUS) += hfsplus.o ++aufs-$(CONFIG_AUFS_DEBUG) += debug.o ++aufs-$(CONFIG_AUFS_MAGIC_SYSRQ) += sysrq.o +diff --git a/fs/aufs/aufs.h b/fs/aufs/aufs.h +new file mode 100644 +index 0000000..e48d268 +--- /dev/null ++++ b/fs/aufs/aufs.h +@@ -0,0 +1,59 @@ ++/* ++ * Copyright (C) 2005-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * all header files ++ */ ++ ++#ifndef __AUFS_H__ ++#define __AUFS_H__ ++ ++#ifdef __KERNEL__ ++ ++#define AuStub(type, name, body, ...) \ ++ static inline type name(__VA_ARGS__) { body; } ++ ++#define AuStubVoid(name, ...) \ ++ AuStub(void, name, , __VA_ARGS__) ++#define AuStubInt0(name, ...) \ ++ AuStub(int, name, return 0, __VA_ARGS__) ++ ++#include "debug.h" ++ ++#include "branch.h" ++#include "cpup.h" ++#include "dcsub.h" ++#include "dbgaufs.h" ++#include "dentry.h" ++#include "dir.h" ++#include "dynop.h" ++#include "file.h" ++#include "fstype.h" ++#include "inode.h" ++#include "loop.h" ++#include "module.h" ++#include "opts.h" ++#include "rwsem.h" ++#include "spl.h" ++#include "super.h" ++#include "sysaufs.h" ++#include "vfsub.h" ++#include "whout.h" ++#include "wkq.h" ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_H__ */ +diff --git a/fs/aufs/branch.c b/fs/aufs/branch.c +new file mode 100644 +index 0000000..9b1f7c8 +--- /dev/null ++++ b/fs/aufs/branch.c +@@ -0,0 +1,1406 @@ ++/* ++ * Copyright (C) 2005-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * branch management ++ */ ++ ++#include ++#include ++#include "aufs.h" ++ ++/* ++ * free a single branch ++ */ ++static void au_br_do_free(struct au_branch *br) ++{ ++ int i; ++ struct au_wbr *wbr; ++ struct au_dykey **key; ++ ++ au_hnotify_fin_br(br); ++ ++ if (br->br_xino.xi_file) ++ fput(br->br_xino.xi_file); ++ mutex_destroy(&br->br_xino.xi_nondir_mtx); ++ ++ AuDebugOn(au_br_count(br)); ++ au_br_count_fin(br); ++ ++ wbr = br->br_wbr; ++ if (wbr) { ++ for (i = 0; i < AuBrWh_Last; i++) ++ dput(wbr->wbr_wh[i]); ++ AuDebugOn(atomic_read(&wbr->wbr_wh_running)); ++ AuRwDestroy(&wbr->wbr_wh_rwsem); ++ } ++ ++ if (br->br_fhsm) { ++ au_br_fhsm_fin(br->br_fhsm); ++ kfree(br->br_fhsm); ++ } ++ ++ key = br->br_dykey; ++ for (i = 0; i < AuBrDynOp; i++, key++) ++ if (*key) ++ au_dy_put(*key); ++ else ++ break; ++ ++ /* recursive lock, s_umount of branch's */ ++ lockdep_off(); ++ path_put(&br->br_path); ++ lockdep_on(); ++ kfree(wbr); ++ kfree(br); ++} ++ ++/* ++ * frees all branches ++ */ ++void au_br_free(struct au_sbinfo *sbinfo) ++{ ++ aufs_bindex_t bmax; ++ struct au_branch **br; ++ ++ AuRwMustWriteLock(&sbinfo->si_rwsem); ++ ++ bmax = sbinfo->si_bbot + 1; ++ br = sbinfo->si_branch; ++ while (bmax--) ++ au_br_do_free(*br++); ++} ++ ++/* ++ * find the index of a branch which is specified by @br_id. ++ */ ++int au_br_index(struct super_block *sb, aufs_bindex_t br_id) ++{ ++ aufs_bindex_t bindex, bbot; ++ ++ bbot = au_sbbot(sb); ++ for (bindex = 0; bindex <= bbot; bindex++) ++ if (au_sbr_id(sb, bindex) == br_id) ++ return bindex; ++ return -1; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * add a branch ++ */ ++ ++static int test_overlap(struct super_block *sb, struct dentry *h_adding, ++ struct dentry *h_root) ++{ ++ if (unlikely(h_adding == h_root ++ || au_test_loopback_overlap(sb, h_adding))) ++ return 1; ++ if (h_adding->d_sb != h_root->d_sb) ++ return 0; ++ return au_test_subdir(h_adding, h_root) ++ || au_test_subdir(h_root, h_adding); ++} ++ ++/* ++ * returns a newly allocated branch. @new_nbranch is a number of branches ++ * after adding a branch. ++ */ ++static struct au_branch *au_br_alloc(struct super_block *sb, int new_nbranch, ++ int perm) ++{ ++ struct au_branch *add_branch; ++ struct dentry *root; ++ struct inode *inode; ++ int err; ++ ++ err = -ENOMEM; ++ root = sb->s_root; ++ add_branch = kzalloc(sizeof(*add_branch), GFP_NOFS); ++ if (unlikely(!add_branch)) ++ goto out; ++ ++ err = au_hnotify_init_br(add_branch, perm); ++ if (unlikely(err)) ++ goto out_br; ++ ++ if (au_br_writable(perm)) { ++ /* may be freed separately at changing the branch permission */ ++ add_branch->br_wbr = kzalloc(sizeof(*add_branch->br_wbr), ++ GFP_NOFS); ++ if (unlikely(!add_branch->br_wbr)) ++ goto out_hnotify; ++ } ++ ++ if (au_br_fhsm(perm)) { ++ err = au_fhsm_br_alloc(add_branch); ++ if (unlikely(err)) ++ goto out_wbr; ++ } ++ ++ err = au_sbr_realloc(au_sbi(sb), new_nbranch); ++ if (!err) ++ err = au_di_realloc(au_di(root), new_nbranch); ++ if (!err) { ++ inode = d_inode(root); ++ err = au_hinode_realloc(au_ii(inode), new_nbranch); ++ } ++ if (!err) ++ return add_branch; /* success */ ++ ++out_wbr: ++ kfree(add_branch->br_wbr); ++out_hnotify: ++ au_hnotify_fin_br(add_branch); ++out_br: ++ kfree(add_branch); ++out: ++ return ERR_PTR(err); ++} ++ ++/* ++ * test if the branch permission is legal or not. ++ */ ++static int test_br(struct inode *inode, int brperm, char *path) ++{ ++ int err; ++ ++ err = (au_br_writable(brperm) && IS_RDONLY(inode)); ++ if (!err) ++ goto out; ++ ++ err = -EINVAL; ++ pr_err("write permission for readonly mount or inode, %s\n", path); ++ ++out: ++ return err; ++} ++ ++/* ++ * returns: ++ * 0: success, the caller will add it ++ * plus: success, it is already unified, the caller should ignore it ++ * minus: error ++ */ ++static int test_add(struct super_block *sb, struct au_opt_add *add, int remount) ++{ ++ int err; ++ aufs_bindex_t bbot, bindex; ++ struct dentry *root, *h_dentry; ++ struct inode *inode, *h_inode; ++ ++ root = sb->s_root; ++ bbot = au_sbbot(sb); ++ if (unlikely(bbot >= 0 ++ && au_find_dbindex(root, add->path.dentry) >= 0)) { ++ err = 1; ++ if (!remount) { ++ err = -EINVAL; ++ pr_err("%s duplicated\n", add->pathname); ++ } ++ goto out; ++ } ++ ++ err = -ENOSPC; /* -E2BIG; */ ++ if (unlikely(AUFS_BRANCH_MAX <= add->bindex ++ || AUFS_BRANCH_MAX - 1 <= bbot)) { ++ pr_err("number of branches exceeded %s\n", add->pathname); ++ goto out; ++ } ++ ++ err = -EDOM; ++ if (unlikely(add->bindex < 0 || bbot + 1 < add->bindex)) { ++ pr_err("bad index %d\n", add->bindex); ++ goto out; ++ } ++ ++ inode = d_inode(add->path.dentry); ++ err = -ENOENT; ++ if (unlikely(!inode->i_nlink)) { ++ pr_err("no existence %s\n", add->pathname); ++ goto out; ++ } ++ ++ err = -EINVAL; ++ if (unlikely(inode->i_sb == sb)) { ++ pr_err("%s must be outside\n", add->pathname); ++ goto out; ++ } ++ ++ if (unlikely(au_test_fs_unsuppoted(inode->i_sb))) { ++ pr_err("unsupported filesystem, %s (%s)\n", ++ add->pathname, au_sbtype(inode->i_sb)); ++ goto out; ++ } ++ ++ if (unlikely(inode->i_sb->s_stack_depth)) { ++ pr_err("already stacked, %s (%s)\n", ++ add->pathname, au_sbtype(inode->i_sb)); ++ goto out; ++ } ++ ++ err = test_br(d_inode(add->path.dentry), add->perm, add->pathname); ++ if (unlikely(err)) ++ goto out; ++ ++ if (bbot < 0) ++ return 0; /* success */ ++ ++ err = -EINVAL; ++ for (bindex = 0; bindex <= bbot; bindex++) ++ if (unlikely(test_overlap(sb, add->path.dentry, ++ au_h_dptr(root, bindex)))) { ++ pr_err("%s is overlapped\n", add->pathname); ++ goto out; ++ } ++ ++ err = 0; ++ if (au_opt_test(au_mntflags(sb), WARN_PERM)) { ++ h_dentry = au_h_dptr(root, 0); ++ h_inode = d_inode(h_dentry); ++ if ((h_inode->i_mode & S_IALLUGO) != (inode->i_mode & S_IALLUGO) ++ || !uid_eq(h_inode->i_uid, inode->i_uid) ++ || !gid_eq(h_inode->i_gid, inode->i_gid)) ++ pr_warn("uid/gid/perm %s %u/%u/0%o, %u/%u/0%o\n", ++ add->pathname, ++ i_uid_read(inode), i_gid_read(inode), ++ (inode->i_mode & S_IALLUGO), ++ i_uid_read(h_inode), i_gid_read(h_inode), ++ (h_inode->i_mode & S_IALLUGO)); ++ } ++ ++out: ++ return err; ++} ++ ++/* ++ * initialize or clean the whiteouts for an adding branch ++ */ ++static int au_br_init_wh(struct super_block *sb, struct au_branch *br, ++ int new_perm) ++{ ++ int err, old_perm; ++ aufs_bindex_t bindex; ++ struct inode *h_inode; ++ struct au_wbr *wbr; ++ struct au_hinode *hdir; ++ struct dentry *h_dentry; ++ ++ err = vfsub_mnt_want_write(au_br_mnt(br)); ++ if (unlikely(err)) ++ goto out; ++ ++ wbr = br->br_wbr; ++ old_perm = br->br_perm; ++ br->br_perm = new_perm; ++ hdir = NULL; ++ h_inode = NULL; ++ bindex = au_br_index(sb, br->br_id); ++ if (0 <= bindex) { ++ hdir = au_hi(d_inode(sb->s_root), bindex); ++ au_hn_imtx_lock_nested(hdir, AuLsc_I_PARENT); ++ } else { ++ h_dentry = au_br_dentry(br); ++ h_inode = d_inode(h_dentry); ++ inode_lock_nested(h_inode, AuLsc_I_PARENT); ++ } ++ if (!wbr) ++ err = au_wh_init(br, sb); ++ else { ++ wbr_wh_write_lock(wbr); ++ err = au_wh_init(br, sb); ++ wbr_wh_write_unlock(wbr); ++ } ++ if (hdir) ++ au_hn_imtx_unlock(hdir); ++ else ++ inode_unlock(h_inode); ++ vfsub_mnt_drop_write(au_br_mnt(br)); ++ br->br_perm = old_perm; ++ ++ if (!err && wbr && !au_br_writable(new_perm)) { ++ kfree(wbr); ++ br->br_wbr = NULL; ++ } ++ ++out: ++ return err; ++} ++ ++static int au_wbr_init(struct au_branch *br, struct super_block *sb, ++ int perm) ++{ ++ int err; ++ struct kstatfs kst; ++ struct au_wbr *wbr; ++ ++ wbr = br->br_wbr; ++ au_rw_init(&wbr->wbr_wh_rwsem); ++ atomic_set(&wbr->wbr_wh_running, 0); ++ ++ /* ++ * a limit for rmdir/rename a dir ++ * cf. AUFS_MAX_NAMELEN in include/uapi/linux/aufs_type.h ++ */ ++ err = vfs_statfs(&br->br_path, &kst); ++ if (unlikely(err)) ++ goto out; ++ err = -EINVAL; ++ if (kst.f_namelen >= NAME_MAX) ++ err = au_br_init_wh(sb, br, perm); ++ else ++ pr_err("%pd(%s), unsupported namelen %ld\n", ++ au_br_dentry(br), ++ au_sbtype(au_br_dentry(br)->d_sb), kst.f_namelen); ++ ++out: ++ return err; ++} ++ ++/* initialize a new branch */ ++static int au_br_init(struct au_branch *br, struct super_block *sb, ++ struct au_opt_add *add) ++{ ++ int err; ++ struct inode *h_inode; ++ ++ err = 0; ++ mutex_init(&br->br_xino.xi_nondir_mtx); ++ br->br_perm = add->perm; ++ br->br_path = add->path; /* set first, path_get() later */ ++ spin_lock_init(&br->br_dykey_lock); ++ au_br_count_init(br); ++ atomic_set(&br->br_xino_running, 0); ++ br->br_id = au_new_br_id(sb); ++ AuDebugOn(br->br_id < 0); ++ ++ if (au_br_writable(add->perm)) { ++ err = au_wbr_init(br, sb, add->perm); ++ if (unlikely(err)) ++ goto out_err; ++ } ++ ++ if (au_opt_test(au_mntflags(sb), XINO)) { ++ h_inode = d_inode(add->path.dentry); ++ err = au_xino_br(sb, br, h_inode->i_ino, ++ au_sbr(sb, 0)->br_xino.xi_file, /*do_test*/1); ++ if (unlikely(err)) { ++ AuDebugOn(br->br_xino.xi_file); ++ goto out_err; ++ } ++ } ++ ++ sysaufs_br_init(br); ++ path_get(&br->br_path); ++ goto out; /* success */ ++ ++out_err: ++ memset(&br->br_path, 0, sizeof(br->br_path)); ++out: ++ return err; ++} ++ ++static void au_br_do_add_brp(struct au_sbinfo *sbinfo, aufs_bindex_t bindex, ++ struct au_branch *br, aufs_bindex_t bbot, ++ aufs_bindex_t amount) ++{ ++ struct au_branch **brp; ++ ++ AuRwMustWriteLock(&sbinfo->si_rwsem); ++ ++ brp = sbinfo->si_branch + bindex; ++ memmove(brp + 1, brp, sizeof(*brp) * amount); ++ *brp = br; ++ sbinfo->si_bbot++; ++ if (unlikely(bbot < 0)) ++ sbinfo->si_bbot = 0; ++} ++ ++static void au_br_do_add_hdp(struct au_dinfo *dinfo, aufs_bindex_t bindex, ++ aufs_bindex_t bbot, aufs_bindex_t amount) ++{ ++ struct au_hdentry *hdp; ++ ++ AuRwMustWriteLock(&dinfo->di_rwsem); ++ ++ hdp = dinfo->di_hdentry + bindex; ++ memmove(hdp + 1, hdp, sizeof(*hdp) * amount); ++ au_h_dentry_init(hdp); ++ dinfo->di_bbot++; ++ if (unlikely(bbot < 0)) ++ dinfo->di_btop = 0; ++} ++ ++static void au_br_do_add_hip(struct au_iinfo *iinfo, aufs_bindex_t bindex, ++ aufs_bindex_t bbot, aufs_bindex_t amount) ++{ ++ struct au_hinode *hip; ++ ++ AuRwMustWriteLock(&iinfo->ii_rwsem); ++ ++ hip = au_hinode(iinfo, bindex); ++ memmove(hip + 1, hip, sizeof(*hip) * amount); ++ au_hinode_init(hip); ++ iinfo->ii_bbot++; ++ if (unlikely(bbot < 0)) ++ iinfo->ii_btop = 0; ++} ++ ++static void au_br_do_add(struct super_block *sb, struct au_branch *br, ++ aufs_bindex_t bindex) ++{ ++ struct dentry *root, *h_dentry; ++ struct inode *root_inode, *h_inode; ++ aufs_bindex_t bbot, amount; ++ ++ root = sb->s_root; ++ root_inode = d_inode(root); ++ bbot = au_sbbot(sb); ++ amount = bbot + 1 - bindex; ++ h_dentry = au_br_dentry(br); ++ au_sbilist_lock(); ++ au_br_do_add_brp(au_sbi(sb), bindex, br, bbot, amount); ++ au_br_do_add_hdp(au_di(root), bindex, bbot, amount); ++ au_br_do_add_hip(au_ii(root_inode), bindex, bbot, amount); ++ au_set_h_dptr(root, bindex, dget(h_dentry)); ++ h_inode = d_inode(h_dentry); ++ au_set_h_iptr(root_inode, bindex, au_igrab(h_inode), /*flags*/0); ++ au_sbilist_unlock(); ++} ++ ++int au_br_add(struct super_block *sb, struct au_opt_add *add, int remount) ++{ ++ int err; ++ aufs_bindex_t bbot, add_bindex; ++ struct dentry *root, *h_dentry; ++ struct inode *root_inode; ++ struct au_branch *add_branch; ++ ++ root = sb->s_root; ++ root_inode = d_inode(root); ++ IMustLock(root_inode); ++ err = test_add(sb, add, remount); ++ if (unlikely(err < 0)) ++ goto out; ++ if (err) { ++ err = 0; ++ goto out; /* success */ ++ } ++ ++ bbot = au_sbbot(sb); ++ add_branch = au_br_alloc(sb, bbot + 2, add->perm); ++ err = PTR_ERR(add_branch); ++ if (IS_ERR(add_branch)) ++ goto out; ++ ++ err = au_br_init(add_branch, sb, add); ++ if (unlikely(err)) { ++ au_br_do_free(add_branch); ++ goto out; ++ } ++ ++ add_bindex = add->bindex; ++ if (!remount) ++ au_br_do_add(sb, add_branch, add_bindex); ++ else { ++ sysaufs_brs_del(sb, add_bindex); ++ au_br_do_add(sb, add_branch, add_bindex); ++ sysaufs_brs_add(sb, add_bindex); ++ } ++ ++ h_dentry = add->path.dentry; ++ if (!add_bindex) { ++ au_cpup_attr_all(root_inode, /*force*/1); ++ sb->s_maxbytes = h_dentry->d_sb->s_maxbytes; ++ } else ++ au_add_nlink(root_inode, d_inode(h_dentry)); ++ ++ /* ++ * this test/set prevents aufs from handling unnecesary notify events ++ * of xino files, in case of re-adding a writable branch which was ++ * once detached from aufs. ++ */ ++ if (au_xino_brid(sb) < 0 ++ && au_br_writable(add_branch->br_perm) ++ && !au_test_fs_bad_xino(h_dentry->d_sb) ++ && add_branch->br_xino.xi_file ++ && add_branch->br_xino.xi_file->f_path.dentry->d_parent == h_dentry) ++ au_xino_brid_set(sb, add_branch->br_id); ++ ++out: ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static unsigned long long au_farray_cb(struct super_block *sb, void *a, ++ unsigned long long max __maybe_unused, ++ void *arg) ++{ ++ unsigned long long n; ++ struct file **p, *f; ++ struct au_sphlhead *files; ++ struct au_finfo *finfo; ++ ++ n = 0; ++ p = a; ++ files = &au_sbi(sb)->si_files; ++ spin_lock(&files->spin); ++ hlist_for_each_entry(finfo, &files->head, fi_hlist) { ++ f = finfo->fi_file; ++ if (file_count(f) ++ && !special_file(file_inode(f)->i_mode)) { ++ get_file(f); ++ *p++ = f; ++ n++; ++ AuDebugOn(n > max); ++ } ++ } ++ spin_unlock(&files->spin); ++ ++ return n; ++} ++ ++static struct file **au_farray_alloc(struct super_block *sb, ++ unsigned long long *max) ++{ ++ *max = au_nfiles(sb); ++ return au_array_alloc(max, au_farray_cb, sb, /*arg*/NULL); ++} ++ ++static void au_farray_free(struct file **a, unsigned long long max) ++{ ++ unsigned long long ull; ++ ++ for (ull = 0; ull < max; ull++) ++ if (a[ull]) ++ fput(a[ull]); ++ kvfree(a); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * delete a branch ++ */ ++ ++/* to show the line number, do not make it inlined function */ ++#define AuVerbose(do_info, fmt, ...) do { \ ++ if (do_info) \ ++ pr_info(fmt, ##__VA_ARGS__); \ ++} while (0) ++ ++static int au_test_ibusy(struct inode *inode, aufs_bindex_t btop, ++ aufs_bindex_t bbot) ++{ ++ return (inode && !S_ISDIR(inode->i_mode)) || btop == bbot; ++} ++ ++static int au_test_dbusy(struct dentry *dentry, aufs_bindex_t btop, ++ aufs_bindex_t bbot) ++{ ++ return au_test_ibusy(d_inode(dentry), btop, bbot); ++} ++ ++/* ++ * test if the branch is deletable or not. ++ */ ++static int test_dentry_busy(struct dentry *root, aufs_bindex_t bindex, ++ unsigned int sigen, const unsigned int verbose) ++{ ++ int err, i, j, ndentry; ++ aufs_bindex_t btop, bbot; ++ struct au_dcsub_pages dpages; ++ struct au_dpage *dpage; ++ struct dentry *d; ++ ++ err = au_dpages_init(&dpages, GFP_NOFS); ++ if (unlikely(err)) ++ goto out; ++ err = au_dcsub_pages(&dpages, root, NULL, NULL); ++ if (unlikely(err)) ++ goto out_dpages; ++ ++ for (i = 0; !err && i < dpages.ndpage; i++) { ++ dpage = dpages.dpages + i; ++ ndentry = dpage->ndentry; ++ for (j = 0; !err && j < ndentry; j++) { ++ d = dpage->dentries[j]; ++ AuDebugOn(au_dcount(d) <= 0); ++ if (!au_digen_test(d, sigen)) { ++ di_read_lock_child(d, AuLock_IR); ++ if (unlikely(au_dbrange_test(d))) { ++ di_read_unlock(d, AuLock_IR); ++ continue; ++ } ++ } else { ++ di_write_lock_child(d); ++ if (unlikely(au_dbrange_test(d))) { ++ di_write_unlock(d); ++ continue; ++ } ++ err = au_reval_dpath(d, sigen); ++ if (!err) ++ di_downgrade_lock(d, AuLock_IR); ++ else { ++ di_write_unlock(d); ++ break; ++ } ++ } ++ ++ /* AuDbgDentry(d); */ ++ btop = au_dbtop(d); ++ bbot = au_dbbot(d); ++ if (btop <= bindex ++ && bindex <= bbot ++ && au_h_dptr(d, bindex) ++ && au_test_dbusy(d, btop, bbot)) { ++ err = -EBUSY; ++ AuVerbose(verbose, "busy %pd\n", d); ++ AuDbgDentry(d); ++ } ++ di_read_unlock(d, AuLock_IR); ++ } ++ } ++ ++out_dpages: ++ au_dpages_free(&dpages); ++out: ++ return err; ++} ++ ++static int test_inode_busy(struct super_block *sb, aufs_bindex_t bindex, ++ unsigned int sigen, const unsigned int verbose) ++{ ++ int err; ++ unsigned long long max, ull; ++ struct inode *i, **array; ++ aufs_bindex_t btop, bbot; ++ ++ array = au_iarray_alloc(sb, &max); ++ err = PTR_ERR(array); ++ if (IS_ERR(array)) ++ goto out; ++ ++ err = 0; ++ AuDbg("b%d\n", bindex); ++ for (ull = 0; !err && ull < max; ull++) { ++ i = array[ull]; ++ if (unlikely(!i)) ++ break; ++ if (i->i_ino == AUFS_ROOT_INO) ++ continue; ++ ++ /* AuDbgInode(i); */ ++ if (au_iigen(i, NULL) == sigen) ++ ii_read_lock_child(i); ++ else { ++ ii_write_lock_child(i); ++ err = au_refresh_hinode_self(i); ++ au_iigen_dec(i); ++ if (!err) ++ ii_downgrade_lock(i); ++ else { ++ ii_write_unlock(i); ++ break; ++ } ++ } ++ ++ btop = au_ibtop(i); ++ bbot = au_ibbot(i); ++ if (btop <= bindex ++ && bindex <= bbot ++ && au_h_iptr(i, bindex) ++ && au_test_ibusy(i, btop, bbot)) { ++ err = -EBUSY; ++ AuVerbose(verbose, "busy i%lu\n", i->i_ino); ++ AuDbgInode(i); ++ } ++ ii_read_unlock(i); ++ } ++ au_iarray_free(array, max); ++ ++out: ++ return err; ++} ++ ++static int test_children_busy(struct dentry *root, aufs_bindex_t bindex, ++ const unsigned int verbose) ++{ ++ int err; ++ unsigned int sigen; ++ ++ sigen = au_sigen(root->d_sb); ++ DiMustNoWaiters(root); ++ IiMustNoWaiters(d_inode(root)); ++ di_write_unlock(root); ++ err = test_dentry_busy(root, bindex, sigen, verbose); ++ if (!err) ++ err = test_inode_busy(root->d_sb, bindex, sigen, verbose); ++ di_write_lock_child(root); /* aufs_write_lock() calls ..._child() */ ++ ++ return err; ++} ++ ++static int test_dir_busy(struct file *file, aufs_bindex_t br_id, ++ struct file **to_free, int *idx) ++{ ++ int err; ++ unsigned char matched, root; ++ aufs_bindex_t bindex, bbot; ++ struct au_fidir *fidir; ++ struct au_hfile *hfile; ++ ++ err = 0; ++ root = IS_ROOT(file->f_path.dentry); ++ if (root) { ++ get_file(file); ++ to_free[*idx] = file; ++ (*idx)++; ++ goto out; ++ } ++ ++ matched = 0; ++ fidir = au_fi(file)->fi_hdir; ++ AuDebugOn(!fidir); ++ bbot = au_fbbot_dir(file); ++ for (bindex = au_fbtop(file); bindex <= bbot; bindex++) { ++ hfile = fidir->fd_hfile + bindex; ++ if (!hfile->hf_file) ++ continue; ++ ++ if (hfile->hf_br->br_id == br_id) { ++ matched = 1; ++ break; ++ } ++ } ++ if (matched) ++ err = -EBUSY; ++ ++out: ++ return err; ++} ++ ++static int test_file_busy(struct super_block *sb, aufs_bindex_t br_id, ++ struct file **to_free, int opened) ++{ ++ int err, idx; ++ unsigned long long ull, max; ++ aufs_bindex_t btop; ++ struct file *file, **array; ++ struct dentry *root; ++ struct au_hfile *hfile; ++ ++ array = au_farray_alloc(sb, &max); ++ err = PTR_ERR(array); ++ if (IS_ERR(array)) ++ goto out; ++ ++ err = 0; ++ idx = 0; ++ root = sb->s_root; ++ di_write_unlock(root); ++ for (ull = 0; ull < max; ull++) { ++ file = array[ull]; ++ if (unlikely(!file)) ++ break; ++ ++ /* AuDbg("%pD\n", file); */ ++ fi_read_lock(file); ++ btop = au_fbtop(file); ++ if (!d_is_dir(file->f_path.dentry)) { ++ hfile = &au_fi(file)->fi_htop; ++ if (hfile->hf_br->br_id == br_id) ++ err = -EBUSY; ++ } else ++ err = test_dir_busy(file, br_id, to_free, &idx); ++ fi_read_unlock(file); ++ if (unlikely(err)) ++ break; ++ } ++ di_write_lock_child(root); ++ au_farray_free(array, max); ++ AuDebugOn(idx > opened); ++ ++out: ++ return err; ++} ++ ++static void br_del_file(struct file **to_free, unsigned long long opened, ++ aufs_bindex_t br_id) ++{ ++ unsigned long long ull; ++ aufs_bindex_t bindex, btop, bbot, bfound; ++ struct file *file; ++ struct au_fidir *fidir; ++ struct au_hfile *hfile; ++ ++ for (ull = 0; ull < opened; ull++) { ++ file = to_free[ull]; ++ if (unlikely(!file)) ++ break; ++ ++ /* AuDbg("%pD\n", file); */ ++ AuDebugOn(!d_is_dir(file->f_path.dentry)); ++ bfound = -1; ++ fidir = au_fi(file)->fi_hdir; ++ AuDebugOn(!fidir); ++ fi_write_lock(file); ++ btop = au_fbtop(file); ++ bbot = au_fbbot_dir(file); ++ for (bindex = btop; bindex <= bbot; bindex++) { ++ hfile = fidir->fd_hfile + bindex; ++ if (!hfile->hf_file) ++ continue; ++ ++ if (hfile->hf_br->br_id == br_id) { ++ bfound = bindex; ++ break; ++ } ++ } ++ AuDebugOn(bfound < 0); ++ au_set_h_fptr(file, bfound, NULL); ++ if (bfound == btop) { ++ for (btop++; btop <= bbot; btop++) ++ if (au_hf_dir(file, btop)) { ++ au_set_fbtop(file, btop); ++ break; ++ } ++ } ++ fi_write_unlock(file); ++ } ++} ++ ++static void au_br_do_del_brp(struct au_sbinfo *sbinfo, ++ const aufs_bindex_t bindex, ++ const aufs_bindex_t bbot) ++{ ++ struct au_branch **brp, **p; ++ ++ AuRwMustWriteLock(&sbinfo->si_rwsem); ++ ++ brp = sbinfo->si_branch + bindex; ++ if (bindex < bbot) ++ memmove(brp, brp + 1, sizeof(*brp) * (bbot - bindex)); ++ sbinfo->si_branch[0 + bbot] = NULL; ++ sbinfo->si_bbot--; ++ ++ p = krealloc(sbinfo->si_branch, sizeof(*p) * bbot, AuGFP_SBILIST); ++ if (p) ++ sbinfo->si_branch = p; ++ /* harmless error */ ++} ++ ++static void au_br_do_del_hdp(struct au_dinfo *dinfo, const aufs_bindex_t bindex, ++ const aufs_bindex_t bbot) ++{ ++ struct au_hdentry *hdp, *p; ++ ++ AuRwMustWriteLock(&dinfo->di_rwsem); ++ ++ hdp = dinfo->di_hdentry; ++ if (bindex < bbot) ++ memmove(hdp + bindex, hdp + bindex + 1, ++ sizeof(*hdp) * (bbot - bindex)); ++ hdp[0 + bbot].hd_dentry = NULL; ++ dinfo->di_bbot--; ++ ++ p = krealloc(hdp, sizeof(*p) * bbot, AuGFP_SBILIST); ++ if (p) ++ dinfo->di_hdentry = p; ++ /* harmless error */ ++} ++ ++static void au_br_do_del_hip(struct au_iinfo *iinfo, const aufs_bindex_t bindex, ++ const aufs_bindex_t bbot) ++{ ++ struct au_hinode *hip, *p; ++ ++ AuRwMustWriteLock(&iinfo->ii_rwsem); ++ ++ hip = au_hinode(iinfo, bindex); ++ if (bindex < bbot) ++ memmove(hip, hip + 1, sizeof(*hip) * (bbot - bindex)); ++ /* au_hinode_init(au_hinode(iinfo, bbot)); */ ++ iinfo->ii_bbot--; ++ ++ p = krealloc(iinfo->ii_hinode, sizeof(*p) * bbot, AuGFP_SBILIST); ++ if (p) ++ iinfo->ii_hinode = p; ++ /* harmless error */ ++} ++ ++static void au_br_do_del(struct super_block *sb, aufs_bindex_t bindex, ++ struct au_branch *br) ++{ ++ aufs_bindex_t bbot; ++ struct au_sbinfo *sbinfo; ++ struct dentry *root, *h_root; ++ struct inode *inode, *h_inode; ++ struct au_hinode *hinode; ++ ++ SiMustWriteLock(sb); ++ ++ root = sb->s_root; ++ inode = d_inode(root); ++ sbinfo = au_sbi(sb); ++ bbot = sbinfo->si_bbot; ++ ++ h_root = au_h_dptr(root, bindex); ++ hinode = au_hi(inode, bindex); ++ h_inode = au_igrab(hinode->hi_inode); ++ au_hiput(hinode); ++ ++ au_sbilist_lock(); ++ au_br_do_del_brp(sbinfo, bindex, bbot); ++ au_br_do_del_hdp(au_di(root), bindex, bbot); ++ au_br_do_del_hip(au_ii(inode), bindex, bbot); ++ au_sbilist_unlock(); ++ ++ dput(h_root); ++ iput(h_inode); ++ au_br_do_free(br); ++} ++ ++static unsigned long long empty_cb(struct super_block *sb, void *array, ++ unsigned long long max, void *arg) ++{ ++ return max; ++} ++ ++int au_br_del(struct super_block *sb, struct au_opt_del *del, int remount) ++{ ++ int err, rerr, i; ++ unsigned long long opened; ++ unsigned int mnt_flags; ++ aufs_bindex_t bindex, bbot, br_id; ++ unsigned char do_wh, verbose; ++ struct au_branch *br; ++ struct au_wbr *wbr; ++ struct dentry *root; ++ struct file **to_free; ++ ++ err = 0; ++ opened = 0; ++ to_free = NULL; ++ root = sb->s_root; ++ bindex = au_find_dbindex(root, del->h_path.dentry); ++ if (bindex < 0) { ++ if (remount) ++ goto out; /* success */ ++ err = -ENOENT; ++ pr_err("%s no such branch\n", del->pathname); ++ goto out; ++ } ++ AuDbg("bindex b%d\n", bindex); ++ ++ err = -EBUSY; ++ mnt_flags = au_mntflags(sb); ++ verbose = !!au_opt_test(mnt_flags, VERBOSE); ++ bbot = au_sbbot(sb); ++ if (unlikely(!bbot)) { ++ AuVerbose(verbose, "no more branches left\n"); ++ goto out; ++ } ++ br = au_sbr(sb, bindex); ++ AuDebugOn(!path_equal(&br->br_path, &del->h_path)); ++ ++ br_id = br->br_id; ++ opened = au_br_count(br); ++ if (unlikely(opened)) { ++ to_free = au_array_alloc(&opened, empty_cb, sb, NULL); ++ err = PTR_ERR(to_free); ++ if (IS_ERR(to_free)) ++ goto out; ++ ++ err = test_file_busy(sb, br_id, to_free, opened); ++ if (unlikely(err)) { ++ AuVerbose(verbose, "%llu file(s) opened\n", opened); ++ goto out; ++ } ++ } ++ ++ wbr = br->br_wbr; ++ do_wh = wbr && (wbr->wbr_whbase || wbr->wbr_plink || wbr->wbr_orph); ++ if (do_wh) { ++ /* instead of WbrWhMustWriteLock(wbr) */ ++ SiMustWriteLock(sb); ++ for (i = 0; i < AuBrWh_Last; i++) { ++ dput(wbr->wbr_wh[i]); ++ wbr->wbr_wh[i] = NULL; ++ } ++ } ++ ++ err = test_children_busy(root, bindex, verbose); ++ if (unlikely(err)) { ++ if (do_wh) ++ goto out_wh; ++ goto out; ++ } ++ ++ err = 0; ++ if (to_free) { ++ /* ++ * now we confirmed the branch is deletable. ++ * let's free the remaining opened dirs on the branch. ++ */ ++ di_write_unlock(root); ++ br_del_file(to_free, opened, br_id); ++ di_write_lock_child(root); ++ } ++ ++ if (!remount) ++ au_br_do_del(sb, bindex, br); ++ else { ++ sysaufs_brs_del(sb, bindex); ++ au_br_do_del(sb, bindex, br); ++ sysaufs_brs_add(sb, bindex); ++ } ++ ++ if (!bindex) { ++ au_cpup_attr_all(d_inode(root), /*force*/1); ++ sb->s_maxbytes = au_sbr_sb(sb, 0)->s_maxbytes; ++ } else ++ au_sub_nlink(d_inode(root), d_inode(del->h_path.dentry)); ++ if (au_opt_test(mnt_flags, PLINK)) ++ au_plink_half_refresh(sb, br_id); ++ ++ if (au_xino_brid(sb) == br_id) ++ au_xino_brid_set(sb, -1); ++ goto out; /* success */ ++ ++out_wh: ++ /* revert */ ++ rerr = au_br_init_wh(sb, br, br->br_perm); ++ if (rerr) ++ pr_warn("failed re-creating base whiteout, %s. (%d)\n", ++ del->pathname, rerr); ++out: ++ if (to_free) ++ au_farray_free(to_free, opened); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int au_ibusy(struct super_block *sb, struct aufs_ibusy __user *arg) ++{ ++ int err; ++ aufs_bindex_t btop, bbot; ++ struct aufs_ibusy ibusy; ++ struct inode *inode, *h_inode; ++ ++ err = -EPERM; ++ if (unlikely(!capable(CAP_SYS_ADMIN))) ++ goto out; ++ ++ err = copy_from_user(&ibusy, arg, sizeof(ibusy)); ++ if (!err) ++ err = !access_ok(VERIFY_WRITE, &arg->h_ino, sizeof(arg->h_ino)); ++ if (unlikely(err)) { ++ err = -EFAULT; ++ AuTraceErr(err); ++ goto out; ++ } ++ ++ err = -EINVAL; ++ si_read_lock(sb, AuLock_FLUSH); ++ if (unlikely(ibusy.bindex < 0 || ibusy.bindex > au_sbbot(sb))) ++ goto out_unlock; ++ ++ err = 0; ++ ibusy.h_ino = 0; /* invalid */ ++ inode = ilookup(sb, ibusy.ino); ++ if (!inode ++ || inode->i_ino == AUFS_ROOT_INO ++ || is_bad_inode(inode)) ++ goto out_unlock; ++ ++ ii_read_lock_child(inode); ++ btop = au_ibtop(inode); ++ bbot = au_ibbot(inode); ++ if (btop <= ibusy.bindex && ibusy.bindex <= bbot) { ++ h_inode = au_h_iptr(inode, ibusy.bindex); ++ if (h_inode && au_test_ibusy(inode, btop, bbot)) ++ ibusy.h_ino = h_inode->i_ino; ++ } ++ ii_read_unlock(inode); ++ iput(inode); ++ ++out_unlock: ++ si_read_unlock(sb); ++ if (!err) { ++ err = __put_user(ibusy.h_ino, &arg->h_ino); ++ if (unlikely(err)) { ++ err = -EFAULT; ++ AuTraceErr(err); ++ } ++ } ++out: ++ return err; ++} ++ ++long au_ibusy_ioctl(struct file *file, unsigned long arg) ++{ ++ return au_ibusy(file->f_path.dentry->d_sb, (void __user *)arg); ++} ++ ++#ifdef CONFIG_COMPAT ++long au_ibusy_compat_ioctl(struct file *file, unsigned long arg) ++{ ++ return au_ibusy(file->f_path.dentry->d_sb, compat_ptr(arg)); ++} ++#endif ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * change a branch permission ++ */ ++ ++static void au_warn_ima(void) ++{ ++#ifdef CONFIG_IMA ++ /* since it doesn't support mark_files_ro() */ ++ AuWarn1("RW -> RO makes IMA to produce wrong message\n"); ++#endif ++} ++ ++static int do_need_sigen_inc(int a, int b) ++{ ++ return au_br_whable(a) && !au_br_whable(b); ++} ++ ++static int need_sigen_inc(int old, int new) ++{ ++ return do_need_sigen_inc(old, new) ++ || do_need_sigen_inc(new, old); ++} ++ ++static int au_br_mod_files_ro(struct super_block *sb, aufs_bindex_t bindex) ++{ ++ int err, do_warn; ++ unsigned int mnt_flags; ++ unsigned long long ull, max; ++ aufs_bindex_t br_id; ++ unsigned char verbose, writer; ++ struct file *file, *hf, **array; ++ struct au_hfile *hfile; ++ ++ mnt_flags = au_mntflags(sb); ++ verbose = !!au_opt_test(mnt_flags, VERBOSE); ++ ++ array = au_farray_alloc(sb, &max); ++ err = PTR_ERR(array); ++ if (IS_ERR(array)) ++ goto out; ++ ++ do_warn = 0; ++ br_id = au_sbr_id(sb, bindex); ++ for (ull = 0; ull < max; ull++) { ++ file = array[ull]; ++ if (unlikely(!file)) ++ break; ++ ++ /* AuDbg("%pD\n", file); */ ++ fi_read_lock(file); ++ if (unlikely(au_test_mmapped(file))) { ++ err = -EBUSY; ++ AuVerbose(verbose, "mmapped %pD\n", file); ++ AuDbgFile(file); ++ FiMustNoWaiters(file); ++ fi_read_unlock(file); ++ goto out_array; ++ } ++ ++ hfile = &au_fi(file)->fi_htop; ++ hf = hfile->hf_file; ++ if (!d_is_reg(file->f_path.dentry) ++ || !(file->f_mode & FMODE_WRITE) ++ || hfile->hf_br->br_id != br_id ++ || !(hf->f_mode & FMODE_WRITE)) ++ array[ull] = NULL; ++ else { ++ do_warn = 1; ++ get_file(file); ++ } ++ ++ FiMustNoWaiters(file); ++ fi_read_unlock(file); ++ fput(file); ++ } ++ ++ err = 0; ++ if (do_warn) ++ au_warn_ima(); ++ ++ for (ull = 0; ull < max; ull++) { ++ file = array[ull]; ++ if (!file) ++ continue; ++ ++ /* todo: already flushed? */ ++ /* ++ * fs/super.c:mark_files_ro() is gone, but aufs keeps its ++ * approach which resets f_mode and calls mnt_drop_write() and ++ * file_release_write() for each file, because the branch ++ * attribute in aufs world is totally different from the native ++ * fs rw/ro mode. ++ */ ++ /* fi_read_lock(file); */ ++ hfile = &au_fi(file)->fi_htop; ++ hf = hfile->hf_file; ++ /* fi_read_unlock(file); */ ++ spin_lock(&hf->f_lock); ++ writer = !!(hf->f_mode & FMODE_WRITER); ++ hf->f_mode &= ~(FMODE_WRITE | FMODE_WRITER); ++ spin_unlock(&hf->f_lock); ++ if (writer) { ++ put_write_access(file_inode(hf)); ++ __mnt_drop_write(hf->f_path.mnt); ++ } ++ } ++ ++out_array: ++ au_farray_free(array, max); ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++int au_br_mod(struct super_block *sb, struct au_opt_mod *mod, int remount, ++ int *do_refresh) ++{ ++ int err, rerr; ++ aufs_bindex_t bindex; ++ struct dentry *root; ++ struct au_branch *br; ++ struct au_br_fhsm *bf; ++ ++ root = sb->s_root; ++ bindex = au_find_dbindex(root, mod->h_root); ++ if (bindex < 0) { ++ if (remount) ++ return 0; /* success */ ++ err = -ENOENT; ++ pr_err("%s no such branch\n", mod->path); ++ goto out; ++ } ++ AuDbg("bindex b%d\n", bindex); ++ ++ err = test_br(d_inode(mod->h_root), mod->perm, mod->path); ++ if (unlikely(err)) ++ goto out; ++ ++ br = au_sbr(sb, bindex); ++ AuDebugOn(mod->h_root != au_br_dentry(br)); ++ if (br->br_perm == mod->perm) ++ return 0; /* success */ ++ ++ /* pre-allocate for non-fhsm --> fhsm */ ++ bf = NULL; ++ if (!au_br_fhsm(br->br_perm) && au_br_fhsm(mod->perm)) { ++ err = au_fhsm_br_alloc(br); ++ if (unlikely(err)) ++ goto out; ++ bf = br->br_fhsm; ++ br->br_fhsm = NULL; ++ } ++ ++ if (au_br_writable(br->br_perm)) { ++ /* remove whiteout base */ ++ err = au_br_init_wh(sb, br, mod->perm); ++ if (unlikely(err)) ++ goto out_bf; ++ ++ if (!au_br_writable(mod->perm)) { ++ /* rw --> ro, file might be mmapped */ ++ DiMustNoWaiters(root); ++ IiMustNoWaiters(d_inode(root)); ++ di_write_unlock(root); ++ err = au_br_mod_files_ro(sb, bindex); ++ /* aufs_write_lock() calls ..._child() */ ++ di_write_lock_child(root); ++ ++ if (unlikely(err)) { ++ rerr = -ENOMEM; ++ br->br_wbr = kzalloc(sizeof(*br->br_wbr), ++ GFP_NOFS); ++ if (br->br_wbr) ++ rerr = au_wbr_init(br, sb, br->br_perm); ++ if (unlikely(rerr)) { ++ AuIOErr("nested error %d (%d)\n", ++ rerr, err); ++ br->br_perm = mod->perm; ++ } ++ } ++ } ++ } else if (au_br_writable(mod->perm)) { ++ /* ro --> rw */ ++ err = -ENOMEM; ++ br->br_wbr = kzalloc(sizeof(*br->br_wbr), GFP_NOFS); ++ if (br->br_wbr) { ++ err = au_wbr_init(br, sb, mod->perm); ++ if (unlikely(err)) { ++ kfree(br->br_wbr); ++ br->br_wbr = NULL; ++ } ++ } ++ } ++ if (unlikely(err)) ++ goto out_bf; ++ ++ if (au_br_fhsm(br->br_perm)) { ++ if (!au_br_fhsm(mod->perm)) { ++ /* fhsm --> non-fhsm */ ++ au_br_fhsm_fin(br->br_fhsm); ++ kfree(br->br_fhsm); ++ br->br_fhsm = NULL; ++ } ++ } else if (au_br_fhsm(mod->perm)) ++ /* non-fhsm --> fhsm */ ++ br->br_fhsm = bf; ++ ++ *do_refresh |= need_sigen_inc(br->br_perm, mod->perm); ++ br->br_perm = mod->perm; ++ goto out; /* success */ ++ ++out_bf: ++ kfree(bf); ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++int au_br_stfs(struct au_branch *br, struct aufs_stfs *stfs) ++{ ++ int err; ++ struct kstatfs kstfs; ++ ++ err = vfs_statfs(&br->br_path, &kstfs); ++ if (!err) { ++ stfs->f_blocks = kstfs.f_blocks; ++ stfs->f_bavail = kstfs.f_bavail; ++ stfs->f_files = kstfs.f_files; ++ stfs->f_ffree = kstfs.f_ffree; ++ } ++ ++ return err; ++} +diff --git a/fs/aufs/branch.h b/fs/aufs/branch.h +new file mode 100644 +index 0000000..bbfa76b +--- /dev/null ++++ b/fs/aufs/branch.h +@@ -0,0 +1,309 @@ ++/* ++ * Copyright (C) 2005-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * branch filesystems and xino for them ++ */ ++ ++#ifndef __AUFS_BRANCH_H__ ++#define __AUFS_BRANCH_H__ ++ ++#ifdef __KERNEL__ ++ ++#include ++#include "dynop.h" ++#include "rwsem.h" ++#include "super.h" ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* a xino file */ ++struct au_xino_file { ++ struct file *xi_file; ++ struct mutex xi_nondir_mtx; ++ ++ /* todo: make xino files an array to support huge inode number */ ++ ++#ifdef CONFIG_DEBUG_FS ++ struct dentry *xi_dbgaufs; ++#endif ++}; ++ ++/* File-based Hierarchical Storage Management */ ++struct au_br_fhsm { ++#ifdef CONFIG_AUFS_FHSM ++ struct mutex bf_lock; ++ unsigned long bf_jiffy; ++ struct aufs_stfs bf_stfs; ++ int bf_readable; ++#endif ++}; ++ ++/* members for writable branch only */ ++enum {AuBrWh_BASE, AuBrWh_PLINK, AuBrWh_ORPH, AuBrWh_Last}; ++struct au_wbr { ++ struct au_rwsem wbr_wh_rwsem; ++ struct dentry *wbr_wh[AuBrWh_Last]; ++ atomic_t wbr_wh_running; ++#define wbr_whbase wbr_wh[AuBrWh_BASE] /* whiteout base */ ++#define wbr_plink wbr_wh[AuBrWh_PLINK] /* pseudo-link dir */ ++#define wbr_orph wbr_wh[AuBrWh_ORPH] /* dir for orphans */ ++ ++ /* mfs mode */ ++ unsigned long long wbr_bytes; ++}; ++ ++/* ext2 has 3 types of operations at least, ext3 has 4 */ ++#define AuBrDynOp (AuDyLast * 4) ++ ++#ifdef CONFIG_AUFS_HFSNOTIFY ++/* support for asynchronous destruction */ ++struct au_br_hfsnotify { ++ struct fsnotify_group *hfsn_group; ++}; ++#endif ++ ++/* sysfs entries */ ++struct au_brsysfs { ++ char name[16]; ++ struct attribute attr; ++}; ++ ++enum { ++ AuBrSysfs_BR, ++ AuBrSysfs_BRID, ++ AuBrSysfs_Last ++}; ++ ++/* protected by superblock rwsem */ ++struct au_branch { ++ struct au_xino_file br_xino; ++ ++ aufs_bindex_t br_id; ++ ++ int br_perm; ++ struct path br_path; ++ spinlock_t br_dykey_lock; ++ struct au_dykey *br_dykey[AuBrDynOp]; ++ struct percpu_counter br_count; ++ ++ struct au_wbr *br_wbr; ++ struct au_br_fhsm *br_fhsm; ++ ++ /* xino truncation */ ++ atomic_t br_xino_running; ++ ++#ifdef CONFIG_AUFS_HFSNOTIFY ++ struct au_br_hfsnotify *br_hfsn; ++#endif ++ ++#ifdef CONFIG_SYSFS ++ /* entries under sysfs per mount-point */ ++ struct au_brsysfs br_sysfs[AuBrSysfs_Last]; ++#endif ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++static inline struct vfsmount *au_br_mnt(struct au_branch *br) ++{ ++ return br->br_path.mnt; ++} ++ ++static inline struct dentry *au_br_dentry(struct au_branch *br) ++{ ++ return br->br_path.dentry; ++} ++ ++static inline struct super_block *au_br_sb(struct au_branch *br) ++{ ++ return au_br_mnt(br)->mnt_sb; ++} ++ ++static inline void au_br_get(struct au_branch *br) ++{ ++ percpu_counter_inc(&br->br_count); ++} ++ ++static inline void au_br_put(struct au_branch *br) ++{ ++ percpu_counter_dec(&br->br_count); ++} ++ ++static inline s64 au_br_count(struct au_branch *br) ++{ ++ return percpu_counter_sum(&br->br_count); ++} ++ ++static inline void au_br_count_init(struct au_branch *br) ++{ ++ percpu_counter_init(&br->br_count, 0, GFP_NOFS); ++} ++ ++static inline void au_br_count_fin(struct au_branch *br) ++{ ++ percpu_counter_destroy(&br->br_count); ++} ++ ++static inline int au_br_rdonly(struct au_branch *br) ++{ ++ return ((au_br_sb(br)->s_flags & MS_RDONLY) ++ || !au_br_writable(br->br_perm)) ++ ? -EROFS : 0; ++} ++ ++static inline int au_br_hnotifyable(int brperm __maybe_unused) ++{ ++#ifdef CONFIG_AUFS_HNOTIFY ++ return !(brperm & AuBrPerm_RR); ++#else ++ return 0; ++#endif ++} ++ ++static inline int au_br_test_oflag(int oflag, struct au_branch *br) ++{ ++ int err, exec_flag; ++ ++ err = 0; ++ exec_flag = oflag & __FMODE_EXEC; ++ if (unlikely(exec_flag && path_noexec(&br->br_path))) ++ err = -EACCES; ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* branch.c */ ++struct au_sbinfo; ++void au_br_free(struct au_sbinfo *sinfo); ++int au_br_index(struct super_block *sb, aufs_bindex_t br_id); ++struct au_opt_add; ++int au_br_add(struct super_block *sb, struct au_opt_add *add, int remount); ++struct au_opt_del; ++int au_br_del(struct super_block *sb, struct au_opt_del *del, int remount); ++long au_ibusy_ioctl(struct file *file, unsigned long arg); ++#ifdef CONFIG_COMPAT ++long au_ibusy_compat_ioctl(struct file *file, unsigned long arg); ++#endif ++struct au_opt_mod; ++int au_br_mod(struct super_block *sb, struct au_opt_mod *mod, int remount, ++ int *do_refresh); ++struct aufs_stfs; ++int au_br_stfs(struct au_branch *br, struct aufs_stfs *stfs); ++ ++/* xino.c */ ++static const loff_t au_loff_max = LLONG_MAX; ++ ++int au_xib_trunc(struct super_block *sb); ++ssize_t xino_fread(vfs_readf_t func, struct file *file, void *buf, size_t size, ++ loff_t *pos); ++ssize_t xino_fwrite(vfs_writef_t func, struct file *file, void *buf, ++ size_t size, loff_t *pos); ++struct file *au_xino_create2(struct file *base_file, struct file *copy_src); ++struct file *au_xino_create(struct super_block *sb, char *fname, int silent); ++ino_t au_xino_new_ino(struct super_block *sb); ++void au_xino_delete_inode(struct inode *inode, const int unlinked); ++int au_xino_write(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino, ++ ino_t ino); ++int au_xino_read(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino, ++ ino_t *ino); ++int au_xino_br(struct super_block *sb, struct au_branch *br, ino_t hino, ++ struct file *base_file, int do_test); ++int au_xino_trunc(struct super_block *sb, aufs_bindex_t bindex); ++ ++struct au_opt_xino; ++int au_xino_set(struct super_block *sb, struct au_opt_xino *xino, int remount); ++void au_xino_clr(struct super_block *sb); ++struct file *au_xino_def(struct super_block *sb); ++int au_xino_path(struct seq_file *seq, struct file *file); ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* Superblock to branch */ ++static inline ++aufs_bindex_t au_sbr_id(struct super_block *sb, aufs_bindex_t bindex) ++{ ++ return au_sbr(sb, bindex)->br_id; ++} ++ ++static inline ++struct vfsmount *au_sbr_mnt(struct super_block *sb, aufs_bindex_t bindex) ++{ ++ return au_br_mnt(au_sbr(sb, bindex)); ++} ++ ++static inline ++struct super_block *au_sbr_sb(struct super_block *sb, aufs_bindex_t bindex) ++{ ++ return au_br_sb(au_sbr(sb, bindex)); ++} ++ ++static inline void au_sbr_get(struct super_block *sb, aufs_bindex_t bindex) ++{ ++ au_br_get(au_sbr(sb, bindex)); ++} ++ ++static inline void au_sbr_put(struct super_block *sb, aufs_bindex_t bindex) ++{ ++ au_br_put(au_sbr(sb, bindex)); ++} ++ ++static inline int au_sbr_perm(struct super_block *sb, aufs_bindex_t bindex) ++{ ++ return au_sbr(sb, bindex)->br_perm; ++} ++ ++static inline int au_sbr_whable(struct super_block *sb, aufs_bindex_t bindex) ++{ ++ return au_br_whable(au_sbr_perm(sb, bindex)); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * wbr_wh_read_lock, wbr_wh_write_lock ++ * wbr_wh_read_unlock, wbr_wh_write_unlock, wbr_wh_downgrade_lock ++ */ ++AuSimpleRwsemFuncs(wbr_wh, struct au_wbr *wbr, &wbr->wbr_wh_rwsem); ++ ++#define WbrWhMustNoWaiters(wbr) AuRwMustNoWaiters(&wbr->wbr_wh_rwsem) ++#define WbrWhMustAnyLock(wbr) AuRwMustAnyLock(&wbr->wbr_wh_rwsem) ++#define WbrWhMustWriteLock(wbr) AuRwMustWriteLock(&wbr->wbr_wh_rwsem) ++ ++/* ---------------------------------------------------------------------- */ ++ ++#ifdef CONFIG_AUFS_FHSM ++static inline void au_br_fhsm_init(struct au_br_fhsm *brfhsm) ++{ ++ mutex_init(&brfhsm->bf_lock); ++ brfhsm->bf_jiffy = 0; ++ brfhsm->bf_readable = 0; ++} ++ ++static inline void au_br_fhsm_fin(struct au_br_fhsm *brfhsm) ++{ ++ mutex_destroy(&brfhsm->bf_lock); ++} ++#else ++AuStubVoid(au_br_fhsm_init, struct au_br_fhsm *brfhsm) ++AuStubVoid(au_br_fhsm_fin, struct au_br_fhsm *brfhsm) ++#endif ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_BRANCH_H__ */ +diff --git a/fs/aufs/conf.mk b/fs/aufs/conf.mk +new file mode 100644 +index 0000000..0bbb2d3 +--- /dev/null ++++ b/fs/aufs/conf.mk +@@ -0,0 +1,38 @@ ++ ++AuConfStr = CONFIG_AUFS_FS=${CONFIG_AUFS_FS} ++ ++define AuConf ++ifdef ${1} ++AuConfStr += ${1}=${${1}} ++endif ++endef ++ ++AuConfAll = BRANCH_MAX_127 BRANCH_MAX_511 BRANCH_MAX_1023 BRANCH_MAX_32767 \ ++ SBILIST \ ++ HNOTIFY HFSNOTIFY \ ++ EXPORT INO_T_64 \ ++ XATTR \ ++ FHSM \ ++ RDU \ ++ SHWH \ ++ BR_RAMFS \ ++ BR_FUSE POLL \ ++ BR_HFSPLUS \ ++ BDEV_LOOP \ ++ DEBUG MAGIC_SYSRQ ++$(foreach i, ${AuConfAll}, \ ++ $(eval $(call AuConf,CONFIG_AUFS_${i}))) ++ ++AuConfName = ${obj}/conf.str ++${AuConfName}.tmp: FORCE ++ @echo ${AuConfStr} | tr ' ' '\n' | sed -e 's/^/"/' -e 's/$$/\\n"/' > $@ ++${AuConfName}: ${AuConfName}.tmp ++ @diff -q $< $@ > /dev/null 2>&1 || { \ ++ echo ' GEN ' $@; \ ++ cp -p $< $@; \ ++ } ++FORCE: ++clean-files += ${AuConfName} ${AuConfName}.tmp ++${obj}/sysfs.o: ${AuConfName} ++ ++-include ${srctree}/${src}/conf_priv.mk +diff --git a/fs/aufs/cpup.c b/fs/aufs/cpup.c +new file mode 100644 +index 0000000..584870a +--- /dev/null ++++ b/fs/aufs/cpup.c +@@ -0,0 +1,1379 @@ ++/* ++ * Copyright (C) 2005-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * copy-up functions, see wbr_policy.c for copy-down ++ */ ++ ++#include ++#include ++#include ++#include "aufs.h" ++ ++void au_cpup_attr_flags(struct inode *dst, unsigned int iflags) ++{ ++ const unsigned int mask = S_DEAD | S_SWAPFILE | S_PRIVATE ++ | S_NOATIME | S_NOCMTIME | S_AUTOMOUNT; ++ ++ BUILD_BUG_ON(sizeof(iflags) != sizeof(dst->i_flags)); ++ ++ dst->i_flags |= iflags & ~mask; ++ if (au_test_fs_notime(dst->i_sb)) ++ dst->i_flags |= S_NOATIME | S_NOCMTIME; ++} ++ ++void au_cpup_attr_timesizes(struct inode *inode) ++{ ++ struct inode *h_inode; ++ ++ h_inode = au_h_iptr(inode, au_ibtop(inode)); ++ fsstack_copy_attr_times(inode, h_inode); ++ fsstack_copy_inode_size(inode, h_inode); ++} ++ ++void au_cpup_attr_nlink(struct inode *inode, int force) ++{ ++ struct inode *h_inode; ++ struct super_block *sb; ++ aufs_bindex_t bindex, bbot; ++ ++ sb = inode->i_sb; ++ bindex = au_ibtop(inode); ++ h_inode = au_h_iptr(inode, bindex); ++ if (!force ++ && !S_ISDIR(h_inode->i_mode) ++ && au_opt_test(au_mntflags(sb), PLINK) ++ && au_plink_test(inode)) ++ return; ++ ++ /* ++ * 0 can happen in revalidating. ++ * h_inode->i_mutex may not be held here, but it is harmless since once ++ * i_nlink reaches 0, it will never become positive except O_TMPFILE ++ * case. ++ * todo: O_TMPFILE+linkat(AT_SYMLINK_FOLLOW) bypassing aufs may cause ++ * the incorrect link count. ++ */ ++ set_nlink(inode, h_inode->i_nlink); ++ ++ /* ++ * fewer nlink makes find(1) noisy, but larger nlink doesn't. ++ * it may includes whplink directory. ++ */ ++ if (S_ISDIR(h_inode->i_mode)) { ++ bbot = au_ibbot(inode); ++ for (bindex++; bindex <= bbot; bindex++) { ++ h_inode = au_h_iptr(inode, bindex); ++ if (h_inode) ++ au_add_nlink(inode, h_inode); ++ } ++ } ++} ++ ++void au_cpup_attr_changeable(struct inode *inode) ++{ ++ struct inode *h_inode; ++ ++ h_inode = au_h_iptr(inode, au_ibtop(inode)); ++ inode->i_mode = h_inode->i_mode; ++ inode->i_uid = h_inode->i_uid; ++ inode->i_gid = h_inode->i_gid; ++ au_cpup_attr_timesizes(inode); ++ au_cpup_attr_flags(inode, h_inode->i_flags); ++} ++ ++void au_cpup_igen(struct inode *inode, struct inode *h_inode) ++{ ++ struct au_iinfo *iinfo = au_ii(inode); ++ ++ IiMustWriteLock(inode); ++ ++ iinfo->ii_higen = h_inode->i_generation; ++ iinfo->ii_hsb1 = h_inode->i_sb; ++} ++ ++void au_cpup_attr_all(struct inode *inode, int force) ++{ ++ struct inode *h_inode; ++ ++ h_inode = au_h_iptr(inode, au_ibtop(inode)); ++ au_cpup_attr_changeable(inode); ++ if (inode->i_nlink > 0) ++ au_cpup_attr_nlink(inode, force); ++ inode->i_rdev = h_inode->i_rdev; ++ inode->i_blkbits = h_inode->i_blkbits; ++ au_cpup_igen(inode, h_inode); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* Note: dt_dentry and dt_h_dentry are not dget/dput-ed */ ++ ++/* keep the timestamps of the parent dir when cpup */ ++void au_dtime_store(struct au_dtime *dt, struct dentry *dentry, ++ struct path *h_path) ++{ ++ struct inode *h_inode; ++ ++ dt->dt_dentry = dentry; ++ dt->dt_h_path = *h_path; ++ h_inode = d_inode(h_path->dentry); ++ dt->dt_atime = h_inode->i_atime; ++ dt->dt_mtime = h_inode->i_mtime; ++ /* smp_mb(); */ ++} ++ ++void au_dtime_revert(struct au_dtime *dt) ++{ ++ struct iattr attr; ++ int err; ++ ++ attr.ia_atime = dt->dt_atime; ++ attr.ia_mtime = dt->dt_mtime; ++ attr.ia_valid = ATTR_FORCE | ATTR_MTIME | ATTR_MTIME_SET ++ | ATTR_ATIME | ATTR_ATIME_SET; ++ ++ /* no delegation since this is a directory */ ++ err = vfsub_notify_change(&dt->dt_h_path, &attr, /*delegated*/NULL); ++ if (unlikely(err)) ++ pr_warn("restoring timestamps failed(%d). ignored\n", err); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* internal use only */ ++struct au_cpup_reg_attr { ++ int valid; ++ struct kstat st; ++ unsigned int iflags; /* inode->i_flags */ ++}; ++ ++static noinline_for_stack ++int cpup_iattr(struct dentry *dst, aufs_bindex_t bindex, struct dentry *h_src, ++ struct au_cpup_reg_attr *h_src_attr) ++{ ++ int err, sbits, icex; ++ unsigned int mnt_flags; ++ unsigned char verbose; ++ struct iattr ia; ++ struct path h_path; ++ struct inode *h_isrc, *h_idst; ++ struct kstat *h_st; ++ struct au_branch *br; ++ ++ h_path.dentry = au_h_dptr(dst, bindex); ++ h_idst = d_inode(h_path.dentry); ++ br = au_sbr(dst->d_sb, bindex); ++ h_path.mnt = au_br_mnt(br); ++ h_isrc = d_inode(h_src); ++ ia.ia_valid = ATTR_FORCE | ATTR_UID | ATTR_GID ++ | ATTR_ATIME | ATTR_MTIME ++ | ATTR_ATIME_SET | ATTR_MTIME_SET; ++ if (h_src_attr && h_src_attr->valid) { ++ h_st = &h_src_attr->st; ++ ia.ia_uid = h_st->uid; ++ ia.ia_gid = h_st->gid; ++ ia.ia_atime = h_st->atime; ++ ia.ia_mtime = h_st->mtime; ++ if (h_idst->i_mode != h_st->mode ++ && !S_ISLNK(h_idst->i_mode)) { ++ ia.ia_valid |= ATTR_MODE; ++ ia.ia_mode = h_st->mode; ++ } ++ sbits = !!(h_st->mode & (S_ISUID | S_ISGID)); ++ au_cpup_attr_flags(h_idst, h_src_attr->iflags); ++ } else { ++ ia.ia_uid = h_isrc->i_uid; ++ ia.ia_gid = h_isrc->i_gid; ++ ia.ia_atime = h_isrc->i_atime; ++ ia.ia_mtime = h_isrc->i_mtime; ++ if (h_idst->i_mode != h_isrc->i_mode ++ && !S_ISLNK(h_idst->i_mode)) { ++ ia.ia_valid |= ATTR_MODE; ++ ia.ia_mode = h_isrc->i_mode; ++ } ++ sbits = !!(h_isrc->i_mode & (S_ISUID | S_ISGID)); ++ au_cpup_attr_flags(h_idst, h_isrc->i_flags); ++ } ++ /* no delegation since it is just created */ ++ err = vfsub_notify_change(&h_path, &ia, /*delegated*/NULL); ++ ++ /* is this nfs only? */ ++ if (!err && sbits && au_test_nfs(h_path.dentry->d_sb)) { ++ ia.ia_valid = ATTR_FORCE | ATTR_MODE; ++ ia.ia_mode = h_isrc->i_mode; ++ err = vfsub_notify_change(&h_path, &ia, /*delegated*/NULL); ++ } ++ ++ icex = br->br_perm & AuBrAttr_ICEX; ++ if (!err) { ++ mnt_flags = au_mntflags(dst->d_sb); ++ verbose = !!au_opt_test(mnt_flags, VERBOSE); ++ err = au_cpup_xattr(h_path.dentry, h_src, icex, verbose); ++ } ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int au_do_copy_file(struct file *dst, struct file *src, loff_t len, ++ char *buf, unsigned long blksize) ++{ ++ int err; ++ size_t sz, rbytes, wbytes; ++ unsigned char all_zero; ++ char *p, *zp; ++ struct inode *h_inode; ++ /* reduce stack usage */ ++ struct iattr *ia; ++ ++ zp = page_address(ZERO_PAGE(0)); ++ if (unlikely(!zp)) ++ return -ENOMEM; /* possible? */ ++ ++ err = 0; ++ all_zero = 0; ++ while (len) { ++ AuDbg("len %lld\n", len); ++ sz = blksize; ++ if (len < blksize) ++ sz = len; ++ ++ rbytes = 0; ++ /* todo: signal_pending? */ ++ while (!rbytes || err == -EAGAIN || err == -EINTR) { ++ rbytes = vfsub_read_k(src, buf, sz, &src->f_pos); ++ err = rbytes; ++ } ++ if (unlikely(err < 0)) ++ break; ++ ++ all_zero = 0; ++ if (len >= rbytes && rbytes == blksize) ++ all_zero = !memcmp(buf, zp, rbytes); ++ if (!all_zero) { ++ wbytes = rbytes; ++ p = buf; ++ while (wbytes) { ++ size_t b; ++ ++ b = vfsub_write_k(dst, p, wbytes, &dst->f_pos); ++ err = b; ++ /* todo: signal_pending? */ ++ if (unlikely(err == -EAGAIN || err == -EINTR)) ++ continue; ++ if (unlikely(err < 0)) ++ break; ++ wbytes -= b; ++ p += b; ++ } ++ if (unlikely(err < 0)) ++ break; ++ } else { ++ loff_t res; ++ ++ AuLabel(hole); ++ res = vfsub_llseek(dst, rbytes, SEEK_CUR); ++ err = res; ++ if (unlikely(res < 0)) ++ break; ++ } ++ len -= rbytes; ++ err = 0; ++ } ++ ++ /* the last block may be a hole */ ++ if (!err && all_zero) { ++ AuLabel(last hole); ++ ++ err = 1; ++ if (au_test_nfs(dst->f_path.dentry->d_sb)) { ++ /* nfs requires this step to make last hole */ ++ /* is this only nfs? */ ++ do { ++ /* todo: signal_pending? */ ++ err = vfsub_write_k(dst, "\0", 1, &dst->f_pos); ++ } while (err == -EAGAIN || err == -EINTR); ++ if (err == 1) ++ dst->f_pos--; ++ } ++ ++ if (err == 1) { ++ ia = (void *)buf; ++ ia->ia_size = dst->f_pos; ++ ia->ia_valid = ATTR_SIZE | ATTR_FILE; ++ ia->ia_file = dst; ++ h_inode = file_inode(dst); ++ inode_lock_nested(h_inode, AuLsc_I_CHILD2); ++ /* no delegation since it is just created */ ++ err = vfsub_notify_change(&dst->f_path, ia, ++ /*delegated*/NULL); ++ inode_unlock(h_inode); ++ } ++ } ++ ++ return err; ++} ++ ++int au_copy_file(struct file *dst, struct file *src, loff_t len) ++{ ++ int err; ++ unsigned long blksize; ++ unsigned char do_kfree; ++ char *buf; ++ ++ err = -ENOMEM; ++ blksize = dst->f_path.dentry->d_sb->s_blocksize; ++ if (!blksize || PAGE_SIZE < blksize) ++ blksize = PAGE_SIZE; ++ AuDbg("blksize %lu\n", blksize); ++ do_kfree = (blksize != PAGE_SIZE && blksize >= sizeof(struct iattr *)); ++ if (do_kfree) ++ buf = kmalloc(blksize, GFP_NOFS); ++ else ++ buf = (void *)__get_free_page(GFP_NOFS); ++ if (unlikely(!buf)) ++ goto out; ++ ++ if (len > (1 << 22)) ++ AuDbg("copying a large file %lld\n", (long long)len); ++ ++ src->f_pos = 0; ++ dst->f_pos = 0; ++ err = au_do_copy_file(dst, src, len, buf, blksize); ++ if (do_kfree) ++ kfree(buf); ++ else ++ free_page((unsigned long)buf); ++ ++out: ++ return err; ++} ++ ++/* ++ * to support a sparse file which is opened with O_APPEND, ++ * we need to close the file. ++ */ ++static int au_cp_regular(struct au_cp_generic *cpg) ++{ ++ int err, i; ++ enum { SRC, DST }; ++ struct { ++ aufs_bindex_t bindex; ++ unsigned int flags; ++ struct dentry *dentry; ++ int force_wr; ++ struct file *file; ++ void *label; ++ } *f, file[] = { ++ { ++ .bindex = cpg->bsrc, ++ .flags = O_RDONLY | O_NOATIME | O_LARGEFILE, ++ .label = &&out ++ }, ++ { ++ .bindex = cpg->bdst, ++ .flags = O_WRONLY | O_NOATIME | O_LARGEFILE, ++ .force_wr = !!au_ftest_cpup(cpg->flags, RWDST), ++ .label = &&out_src ++ } ++ }; ++ struct super_block *sb; ++ struct task_struct *tsk = current; ++ ++ /* bsrc branch can be ro/rw. */ ++ sb = cpg->dentry->d_sb; ++ f = file; ++ for (i = 0; i < 2; i++, f++) { ++ f->dentry = au_h_dptr(cpg->dentry, f->bindex); ++ f->file = au_h_open(cpg->dentry, f->bindex, f->flags, ++ /*file*/NULL, f->force_wr); ++ err = PTR_ERR(f->file); ++ if (IS_ERR(f->file)) ++ goto *f->label; ++ } ++ ++ /* try stopping to update while we copyup */ ++ IMustLock(d_inode(file[SRC].dentry)); ++ err = au_copy_file(file[DST].file, file[SRC].file, cpg->len); ++ ++ /* i wonder if we had O_NO_DELAY_FPUT flag */ ++ if (tsk->flags & PF_KTHREAD) ++ __fput_sync(file[DST].file); ++ else { ++ WARN(1, "%pD\nPlease report this warning to aufs-users ML", ++ file[DST].file); ++ fput(file[DST].file); ++ /* ++ * too bad. ++ * we have to call both since we don't know which place the file ++ * was added to. ++ */ ++ task_work_run(); ++ flush_delayed_fput(); ++ } ++ au_sbr_put(sb, file[DST].bindex); ++ ++out_src: ++ fput(file[SRC].file); ++ au_sbr_put(sb, file[SRC].bindex); ++out: ++ return err; ++} ++ ++static int au_do_cpup_regular(struct au_cp_generic *cpg, ++ struct au_cpup_reg_attr *h_src_attr) ++{ ++ int err, rerr; ++ loff_t l; ++ struct path h_path; ++ struct inode *h_src_inode, *h_dst_inode; ++ ++ err = 0; ++ h_src_inode = au_h_iptr(d_inode(cpg->dentry), cpg->bsrc); ++ l = i_size_read(h_src_inode); ++ if (cpg->len == -1 || l < cpg->len) ++ cpg->len = l; ++ if (cpg->len) { ++ /* try stopping to update while we are referencing */ ++ inode_lock_nested(h_src_inode, AuLsc_I_CHILD); ++ au_pin_hdir_unlock(cpg->pin); ++ ++ h_path.dentry = au_h_dptr(cpg->dentry, cpg->bsrc); ++ h_path.mnt = au_sbr_mnt(cpg->dentry->d_sb, cpg->bsrc); ++ h_src_attr->iflags = h_src_inode->i_flags; ++ if (!au_test_nfs(h_src_inode->i_sb)) ++ err = vfs_getattr(&h_path, &h_src_attr->st); ++ else { ++ inode_unlock(h_src_inode); ++ err = vfs_getattr(&h_path, &h_src_attr->st); ++ inode_lock_nested(h_src_inode, AuLsc_I_CHILD); ++ } ++ if (unlikely(err)) { ++ inode_unlock(h_src_inode); ++ goto out; ++ } ++ h_src_attr->valid = 1; ++ err = au_cp_regular(cpg); ++ inode_unlock(h_src_inode); ++ rerr = au_pin_hdir_relock(cpg->pin); ++ if (!err && rerr) ++ err = rerr; ++ } ++ if (!err && (h_src_inode->i_state & I_LINKABLE)) { ++ h_path.dentry = au_h_dptr(cpg->dentry, cpg->bdst); ++ h_dst_inode = d_inode(h_path.dentry); ++ spin_lock(&h_dst_inode->i_lock); ++ h_dst_inode->i_state |= I_LINKABLE; ++ spin_unlock(&h_dst_inode->i_lock); ++ } ++ ++out: ++ return err; ++} ++ ++static int au_do_cpup_symlink(struct path *h_path, struct dentry *h_src, ++ struct inode *h_dir) ++{ ++ int err, symlen; ++ mm_segment_t old_fs; ++ union { ++ char *k; ++ char __user *u; ++ } sym; ++ struct inode *h_inode = d_inode(h_src); ++ const struct inode_operations *h_iop = h_inode->i_op; ++ ++ err = -ENOSYS; ++ if (unlikely(!h_iop->readlink)) ++ goto out; ++ ++ err = -ENOMEM; ++ sym.k = (void *)__get_free_page(GFP_NOFS); ++ if (unlikely(!sym.k)) ++ goto out; ++ ++ /* unnecessary to support mmap_sem since symlink is not mmap-able */ ++ old_fs = get_fs(); ++ set_fs(KERNEL_DS); ++ symlen = h_iop->readlink(h_src, sym.u, PATH_MAX); ++ err = symlen; ++ set_fs(old_fs); ++ ++ if (symlen > 0) { ++ sym.k[symlen] = 0; ++ err = vfsub_symlink(h_dir, h_path, sym.k); ++ } ++ free_page((unsigned long)sym.k); ++ ++out: ++ return err; ++} ++ ++/* ++ * regardless 'acl' option, reset all ACL. ++ * All ACL will be copied up later from the original entry on the lower branch. ++ */ ++static int au_reset_acl(struct inode *h_dir, struct path *h_path, umode_t mode) ++{ ++ int err; ++ struct dentry *h_dentry; ++ struct inode *h_inode; ++ ++ h_dentry = h_path->dentry; ++ h_inode = d_inode(h_dentry); ++ /* forget_all_cached_acls(h_inode)); */ ++ err = vfsub_removexattr(h_dentry, XATTR_NAME_POSIX_ACL_ACCESS); ++ AuTraceErr(err); ++ if (err == -EOPNOTSUPP) ++ err = 0; ++ if (!err) ++ err = vfsub_acl_chmod(h_inode, mode); ++ ++ AuTraceErr(err); ++ return err; ++} ++ ++static int au_do_cpup_dir(struct au_cp_generic *cpg, struct dentry *dst_parent, ++ struct inode *h_dir, struct path *h_path) ++{ ++ int err; ++ struct inode *dir, *inode; ++ ++ err = vfsub_removexattr(h_path->dentry, XATTR_NAME_POSIX_ACL_DEFAULT); ++ AuTraceErr(err); ++ if (err == -EOPNOTSUPP) ++ err = 0; ++ if (unlikely(err)) ++ goto out; ++ ++ /* ++ * strange behaviour from the users view, ++ * particularry setattr case ++ */ ++ dir = d_inode(dst_parent); ++ if (au_ibtop(dir) == cpg->bdst) ++ au_cpup_attr_nlink(dir, /*force*/1); ++ inode = d_inode(cpg->dentry); ++ au_cpup_attr_nlink(inode, /*force*/1); ++ ++out: ++ return err; ++} ++ ++static noinline_for_stack ++int cpup_entry(struct au_cp_generic *cpg, struct dentry *dst_parent, ++ struct au_cpup_reg_attr *h_src_attr) ++{ ++ int err; ++ umode_t mode; ++ unsigned int mnt_flags; ++ unsigned char isdir, isreg, force; ++ const unsigned char do_dt = !!au_ftest_cpup(cpg->flags, DTIME); ++ struct au_dtime dt; ++ struct path h_path; ++ struct dentry *h_src, *h_dst, *h_parent; ++ struct inode *h_inode, *h_dir; ++ struct super_block *sb; ++ ++ /* bsrc branch can be ro/rw. */ ++ h_src = au_h_dptr(cpg->dentry, cpg->bsrc); ++ h_inode = d_inode(h_src); ++ AuDebugOn(h_inode != au_h_iptr(d_inode(cpg->dentry), cpg->bsrc)); ++ ++ /* try stopping to be referenced while we are creating */ ++ h_dst = au_h_dptr(cpg->dentry, cpg->bdst); ++ if (au_ftest_cpup(cpg->flags, RENAME)) ++ AuDebugOn(strncmp(h_dst->d_name.name, AUFS_WH_PFX, ++ AUFS_WH_PFX_LEN)); ++ h_parent = h_dst->d_parent; /* dir inode is locked */ ++ h_dir = d_inode(h_parent); ++ IMustLock(h_dir); ++ AuDebugOn(h_parent != h_dst->d_parent); ++ ++ sb = cpg->dentry->d_sb; ++ h_path.mnt = au_sbr_mnt(sb, cpg->bdst); ++ if (do_dt) { ++ h_path.dentry = h_parent; ++ au_dtime_store(&dt, dst_parent, &h_path); ++ } ++ h_path.dentry = h_dst; ++ ++ isreg = 0; ++ isdir = 0; ++ mode = h_inode->i_mode; ++ switch (mode & S_IFMT) { ++ case S_IFREG: ++ isreg = 1; ++ err = vfsub_create(h_dir, &h_path, S_IRUSR | S_IWUSR, ++ /*want_excl*/true); ++ if (!err) ++ err = au_do_cpup_regular(cpg, h_src_attr); ++ break; ++ case S_IFDIR: ++ isdir = 1; ++ err = vfsub_mkdir(h_dir, &h_path, mode); ++ if (!err) ++ err = au_do_cpup_dir(cpg, dst_parent, h_dir, &h_path); ++ break; ++ case S_IFLNK: ++ err = au_do_cpup_symlink(&h_path, h_src, h_dir); ++ break; ++ case S_IFCHR: ++ case S_IFBLK: ++ AuDebugOn(!capable(CAP_MKNOD)); ++ /*FALLTHROUGH*/ ++ case S_IFIFO: ++ case S_IFSOCK: ++ err = vfsub_mknod(h_dir, &h_path, mode, h_inode->i_rdev); ++ break; ++ default: ++ AuIOErr("Unknown inode type 0%o\n", mode); ++ err = -EIO; ++ } ++ if (!err) ++ err = au_reset_acl(h_dir, &h_path, mode); ++ ++ mnt_flags = au_mntflags(sb); ++ if (!au_opt_test(mnt_flags, UDBA_NONE) ++ && !isdir ++ && au_opt_test(mnt_flags, XINO) ++ && (h_inode->i_nlink == 1 ++ || (h_inode->i_state & I_LINKABLE)) ++ /* todo: unnecessary? */ ++ /* && d_inode(cpg->dentry)->i_nlink == 1 */ ++ && cpg->bdst < cpg->bsrc ++ && !au_ftest_cpup(cpg->flags, KEEPLINO)) ++ au_xino_write(sb, cpg->bsrc, h_inode->i_ino, /*ino*/0); ++ /* ignore this error */ ++ ++ if (!err) { ++ force = 0; ++ if (isreg) { ++ force = !!cpg->len; ++ if (cpg->len == -1) ++ force = !!i_size_read(h_inode); ++ } ++ au_fhsm_wrote(sb, cpg->bdst, force); ++ } ++ ++ if (do_dt) ++ au_dtime_revert(&dt); ++ return err; ++} ++ ++static int au_do_ren_after_cpup(struct au_cp_generic *cpg, struct path *h_path) ++{ ++ int err; ++ struct dentry *dentry, *h_dentry, *h_parent, *parent; ++ struct inode *h_dir; ++ aufs_bindex_t bdst; ++ ++ dentry = cpg->dentry; ++ bdst = cpg->bdst; ++ h_dentry = au_h_dptr(dentry, bdst); ++ if (!au_ftest_cpup(cpg->flags, OVERWRITE)) { ++ dget(h_dentry); ++ au_set_h_dptr(dentry, bdst, NULL); ++ err = au_lkup_neg(dentry, bdst, /*wh*/0); ++ if (!err) ++ h_path->dentry = dget(au_h_dptr(dentry, bdst)); ++ au_set_h_dptr(dentry, bdst, h_dentry); ++ } else { ++ err = 0; ++ parent = dget_parent(dentry); ++ h_parent = au_h_dptr(parent, bdst); ++ dput(parent); ++ h_path->dentry = vfsub_lkup_one(&dentry->d_name, h_parent); ++ if (IS_ERR(h_path->dentry)) ++ err = PTR_ERR(h_path->dentry); ++ } ++ if (unlikely(err)) ++ goto out; ++ ++ h_parent = h_dentry->d_parent; /* dir inode is locked */ ++ h_dir = d_inode(h_parent); ++ IMustLock(h_dir); ++ AuDbg("%pd %pd\n", h_dentry, h_path->dentry); ++ /* no delegation since it is just created */ ++ err = vfsub_rename(h_dir, h_dentry, h_dir, h_path, /*delegated*/NULL); ++ dput(h_path->dentry); ++ ++out: ++ return err; ++} ++ ++/* ++ * copyup the @dentry from @bsrc to @bdst. ++ * the caller must set the both of lower dentries. ++ * @len is for truncating when it is -1 copyup the entire file. ++ * in link/rename cases, @dst_parent may be different from the real one. ++ * basic->bsrc can be larger than basic->bdst. ++ */ ++static int au_cpup_single(struct au_cp_generic *cpg, struct dentry *dst_parent) ++{ ++ int err, rerr; ++ aufs_bindex_t old_ibtop; ++ unsigned char isdir, plink; ++ struct dentry *h_src, *h_dst, *h_parent; ++ struct inode *dst_inode, *h_dir, *inode, *delegated, *src_inode; ++ struct super_block *sb; ++ struct au_branch *br; ++ /* to reuduce stack size */ ++ struct { ++ struct au_dtime dt; ++ struct path h_path; ++ struct au_cpup_reg_attr h_src_attr; ++ } *a; ++ ++ err = -ENOMEM; ++ a = kmalloc(sizeof(*a), GFP_NOFS); ++ if (unlikely(!a)) ++ goto out; ++ a->h_src_attr.valid = 0; ++ ++ sb = cpg->dentry->d_sb; ++ br = au_sbr(sb, cpg->bdst); ++ a->h_path.mnt = au_br_mnt(br); ++ h_dst = au_h_dptr(cpg->dentry, cpg->bdst); ++ h_parent = h_dst->d_parent; /* dir inode is locked */ ++ h_dir = d_inode(h_parent); ++ IMustLock(h_dir); ++ ++ h_src = au_h_dptr(cpg->dentry, cpg->bsrc); ++ inode = d_inode(cpg->dentry); ++ ++ if (!dst_parent) ++ dst_parent = dget_parent(cpg->dentry); ++ else ++ dget(dst_parent); ++ ++ plink = !!au_opt_test(au_mntflags(sb), PLINK); ++ dst_inode = au_h_iptr(inode, cpg->bdst); ++ if (dst_inode) { ++ if (unlikely(!plink)) { ++ err = -EIO; ++ AuIOErr("hi%lu(i%lu) exists on b%d " ++ "but plink is disabled\n", ++ dst_inode->i_ino, inode->i_ino, cpg->bdst); ++ goto out_parent; ++ } ++ ++ if (dst_inode->i_nlink) { ++ const int do_dt = au_ftest_cpup(cpg->flags, DTIME); ++ ++ h_src = au_plink_lkup(inode, cpg->bdst); ++ err = PTR_ERR(h_src); ++ if (IS_ERR(h_src)) ++ goto out_parent; ++ if (unlikely(d_is_negative(h_src))) { ++ err = -EIO; ++ AuIOErr("i%lu exists on b%d " ++ "but not pseudo-linked\n", ++ inode->i_ino, cpg->bdst); ++ dput(h_src); ++ goto out_parent; ++ } ++ ++ if (do_dt) { ++ a->h_path.dentry = h_parent; ++ au_dtime_store(&a->dt, dst_parent, &a->h_path); ++ } ++ ++ a->h_path.dentry = h_dst; ++ delegated = NULL; ++ err = vfsub_link(h_src, h_dir, &a->h_path, &delegated); ++ if (!err && au_ftest_cpup(cpg->flags, RENAME)) ++ err = au_do_ren_after_cpup(cpg, &a->h_path); ++ if (do_dt) ++ au_dtime_revert(&a->dt); ++ if (unlikely(err == -EWOULDBLOCK)) { ++ pr_warn("cannot retry for NFSv4 delegation" ++ " for an internal link\n"); ++ iput(delegated); ++ } ++ dput(h_src); ++ goto out_parent; ++ } else ++ /* todo: cpup_wh_file? */ ++ /* udba work */ ++ au_update_ibrange(inode, /*do_put_zero*/1); ++ } ++ ++ isdir = S_ISDIR(inode->i_mode); ++ old_ibtop = au_ibtop(inode); ++ err = cpup_entry(cpg, dst_parent, &a->h_src_attr); ++ if (unlikely(err)) ++ goto out_rev; ++ dst_inode = d_inode(h_dst); ++ inode_lock_nested(dst_inode, AuLsc_I_CHILD2); ++ /* todo: necessary? */ ++ /* au_pin_hdir_unlock(cpg->pin); */ ++ ++ err = cpup_iattr(cpg->dentry, cpg->bdst, h_src, &a->h_src_attr); ++ if (unlikely(err)) { ++ /* todo: necessary? */ ++ /* au_pin_hdir_relock(cpg->pin); */ /* ignore an error */ ++ inode_unlock(dst_inode); ++ goto out_rev; ++ } ++ ++ if (cpg->bdst < old_ibtop) { ++ if (S_ISREG(inode->i_mode)) { ++ err = au_dy_iaop(inode, cpg->bdst, dst_inode); ++ if (unlikely(err)) { ++ /* ignore an error */ ++ /* au_pin_hdir_relock(cpg->pin); */ ++ inode_unlock(dst_inode); ++ goto out_rev; ++ } ++ } ++ au_set_ibtop(inode, cpg->bdst); ++ } else ++ au_set_ibbot(inode, cpg->bdst); ++ au_set_h_iptr(inode, cpg->bdst, au_igrab(dst_inode), ++ au_hi_flags(inode, isdir)); ++ ++ /* todo: necessary? */ ++ /* err = au_pin_hdir_relock(cpg->pin); */ ++ inode_unlock(dst_inode); ++ if (unlikely(err)) ++ goto out_rev; ++ ++ src_inode = d_inode(h_src); ++ if (!isdir ++ && (src_inode->i_nlink > 1 ++ || src_inode->i_state & I_LINKABLE) ++ && plink) ++ au_plink_append(inode, cpg->bdst, h_dst); ++ ++ if (au_ftest_cpup(cpg->flags, RENAME)) { ++ a->h_path.dentry = h_dst; ++ err = au_do_ren_after_cpup(cpg, &a->h_path); ++ } ++ if (!err) ++ goto out_parent; /* success */ ++ ++ /* revert */ ++out_rev: ++ a->h_path.dentry = h_parent; ++ au_dtime_store(&a->dt, dst_parent, &a->h_path); ++ a->h_path.dentry = h_dst; ++ rerr = 0; ++ if (d_is_positive(h_dst)) { ++ if (!isdir) { ++ /* no delegation since it is just created */ ++ rerr = vfsub_unlink(h_dir, &a->h_path, ++ /*delegated*/NULL, /*force*/0); ++ } else ++ rerr = vfsub_rmdir(h_dir, &a->h_path); ++ } ++ au_dtime_revert(&a->dt); ++ if (rerr) { ++ AuIOErr("failed removing broken entry(%d, %d)\n", err, rerr); ++ err = -EIO; ++ } ++out_parent: ++ dput(dst_parent); ++ kfree(a); ++out: ++ return err; ++} ++ ++#if 0 /* reserved */ ++struct au_cpup_single_args { ++ int *errp; ++ struct au_cp_generic *cpg; ++ struct dentry *dst_parent; ++}; ++ ++static void au_call_cpup_single(void *args) ++{ ++ struct au_cpup_single_args *a = args; ++ ++ au_pin_hdir_acquire_nest(a->cpg->pin); ++ *a->errp = au_cpup_single(a->cpg, a->dst_parent); ++ au_pin_hdir_release(a->cpg->pin); ++} ++#endif ++ ++/* ++ * prevent SIGXFSZ in copy-up. ++ * testing CAP_MKNOD is for generic fs, ++ * but CAP_FSETID is for xfs only, currently. ++ */ ++static int au_cpup_sio_test(struct au_pin *pin, umode_t mode) ++{ ++ int do_sio; ++ struct super_block *sb; ++ struct inode *h_dir; ++ ++ do_sio = 0; ++ sb = au_pinned_parent(pin)->d_sb; ++ if (!au_wkq_test() ++ && (!au_sbi(sb)->si_plink_maint_pid ++ || au_plink_maint(sb, AuLock_NOPLM))) { ++ switch (mode & S_IFMT) { ++ case S_IFREG: ++ /* no condition about RLIMIT_FSIZE and the file size */ ++ do_sio = 1; ++ break; ++ case S_IFCHR: ++ case S_IFBLK: ++ do_sio = !capable(CAP_MKNOD); ++ break; ++ } ++ if (!do_sio) ++ do_sio = ((mode & (S_ISUID | S_ISGID)) ++ && !capable(CAP_FSETID)); ++ /* this workaround may be removed in the future */ ++ if (!do_sio) { ++ h_dir = au_pinned_h_dir(pin); ++ do_sio = h_dir->i_mode & S_ISVTX; ++ } ++ } ++ ++ return do_sio; ++} ++ ++#if 0 /* reserved */ ++int au_sio_cpup_single(struct au_cp_generic *cpg, struct dentry *dst_parent) ++{ ++ int err, wkq_err; ++ struct dentry *h_dentry; ++ ++ h_dentry = au_h_dptr(cpg->dentry, cpg->bsrc); ++ if (!au_cpup_sio_test(pin, d_inode(h_dentry)->i_mode)) ++ err = au_cpup_single(cpg, dst_parent); ++ else { ++ struct au_cpup_single_args args = { ++ .errp = &err, ++ .cpg = cpg, ++ .dst_parent = dst_parent ++ }; ++ wkq_err = au_wkq_wait(au_call_cpup_single, &args); ++ if (unlikely(wkq_err)) ++ err = wkq_err; ++ } ++ ++ return err; ++} ++#endif ++ ++/* ++ * copyup the @dentry from the first active lower branch to @bdst, ++ * using au_cpup_single(). ++ */ ++static int au_cpup_simple(struct au_cp_generic *cpg) ++{ ++ int err; ++ unsigned int flags_orig; ++ struct dentry *dentry; ++ ++ AuDebugOn(cpg->bsrc < 0); ++ ++ dentry = cpg->dentry; ++ DiMustWriteLock(dentry); ++ ++ err = au_lkup_neg(dentry, cpg->bdst, /*wh*/1); ++ if (!err) { ++ flags_orig = cpg->flags; ++ au_fset_cpup(cpg->flags, RENAME); ++ err = au_cpup_single(cpg, NULL); ++ cpg->flags = flags_orig; ++ if (!err) ++ return 0; /* success */ ++ ++ /* revert */ ++ au_set_h_dptr(dentry, cpg->bdst, NULL); ++ au_set_dbtop(dentry, cpg->bsrc); ++ } ++ ++ return err; ++} ++ ++struct au_cpup_simple_args { ++ int *errp; ++ struct au_cp_generic *cpg; ++}; ++ ++static void au_call_cpup_simple(void *args) ++{ ++ struct au_cpup_simple_args *a = args; ++ ++ au_pin_hdir_acquire_nest(a->cpg->pin); ++ *a->errp = au_cpup_simple(a->cpg); ++ au_pin_hdir_release(a->cpg->pin); ++} ++ ++static int au_do_sio_cpup_simple(struct au_cp_generic *cpg) ++{ ++ int err, wkq_err; ++ struct dentry *dentry, *parent; ++ struct file *h_file; ++ struct inode *h_dir; ++ ++ dentry = cpg->dentry; ++ h_file = NULL; ++ if (au_ftest_cpup(cpg->flags, HOPEN)) { ++ AuDebugOn(cpg->bsrc < 0); ++ h_file = au_h_open_pre(dentry, cpg->bsrc, /*force_wr*/0); ++ err = PTR_ERR(h_file); ++ if (IS_ERR(h_file)) ++ goto out; ++ } ++ ++ parent = dget_parent(dentry); ++ h_dir = au_h_iptr(d_inode(parent), cpg->bdst); ++ if (!au_test_h_perm_sio(h_dir, MAY_EXEC | MAY_WRITE) ++ && !au_cpup_sio_test(cpg->pin, d_inode(dentry)->i_mode)) ++ err = au_cpup_simple(cpg); ++ else { ++ struct au_cpup_simple_args args = { ++ .errp = &err, ++ .cpg = cpg ++ }; ++ wkq_err = au_wkq_wait(au_call_cpup_simple, &args); ++ if (unlikely(wkq_err)) ++ err = wkq_err; ++ } ++ ++ dput(parent); ++ if (h_file) ++ au_h_open_post(dentry, cpg->bsrc, h_file); ++ ++out: ++ return err; ++} ++ ++int au_sio_cpup_simple(struct au_cp_generic *cpg) ++{ ++ aufs_bindex_t bsrc, bbot; ++ struct dentry *dentry, *h_dentry; ++ ++ if (cpg->bsrc < 0) { ++ dentry = cpg->dentry; ++ bbot = au_dbbot(dentry); ++ for (bsrc = cpg->bdst + 1; bsrc <= bbot; bsrc++) { ++ h_dentry = au_h_dptr(dentry, bsrc); ++ if (h_dentry) { ++ AuDebugOn(d_is_negative(h_dentry)); ++ break; ++ } ++ } ++ AuDebugOn(bsrc > bbot); ++ cpg->bsrc = bsrc; ++ } ++ AuDebugOn(cpg->bsrc <= cpg->bdst); ++ return au_do_sio_cpup_simple(cpg); ++} ++ ++int au_sio_cpdown_simple(struct au_cp_generic *cpg) ++{ ++ AuDebugOn(cpg->bdst <= cpg->bsrc); ++ return au_do_sio_cpup_simple(cpg); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * copyup the deleted file for writing. ++ */ ++static int au_do_cpup_wh(struct au_cp_generic *cpg, struct dentry *wh_dentry, ++ struct file *file) ++{ ++ int err; ++ unsigned int flags_orig; ++ aufs_bindex_t bsrc_orig; ++ struct dentry *h_d_dst, *h_d_start; ++ struct au_dinfo *dinfo; ++ struct au_hdentry *hdp; ++ ++ dinfo = au_di(cpg->dentry); ++ AuRwMustWriteLock(&dinfo->di_rwsem); ++ ++ bsrc_orig = cpg->bsrc; ++ cpg->bsrc = dinfo->di_btop; ++ hdp = dinfo->di_hdentry; ++ h_d_dst = hdp[0 + cpg->bdst].hd_dentry; ++ dinfo->di_btop = cpg->bdst; ++ hdp[0 + cpg->bdst].hd_dentry = wh_dentry; ++ h_d_start = NULL; ++ if (file) { ++ h_d_start = hdp[0 + cpg->bsrc].hd_dentry; ++ hdp[0 + cpg->bsrc].hd_dentry = au_hf_top(file)->f_path.dentry; ++ } ++ flags_orig = cpg->flags; ++ cpg->flags = !AuCpup_DTIME; ++ err = au_cpup_single(cpg, /*h_parent*/NULL); ++ cpg->flags = flags_orig; ++ if (file) { ++ if (!err) ++ err = au_reopen_nondir(file); ++ hdp[0 + cpg->bsrc].hd_dentry = h_d_start; ++ } ++ hdp[0 + cpg->bdst].hd_dentry = h_d_dst; ++ dinfo->di_btop = cpg->bsrc; ++ cpg->bsrc = bsrc_orig; ++ ++ return err; ++} ++ ++static int au_cpup_wh(struct au_cp_generic *cpg, struct file *file) ++{ ++ int err; ++ aufs_bindex_t bdst; ++ struct au_dtime dt; ++ struct dentry *dentry, *parent, *h_parent, *wh_dentry; ++ struct au_branch *br; ++ struct path h_path; ++ ++ dentry = cpg->dentry; ++ bdst = cpg->bdst; ++ br = au_sbr(dentry->d_sb, bdst); ++ parent = dget_parent(dentry); ++ h_parent = au_h_dptr(parent, bdst); ++ wh_dentry = au_whtmp_lkup(h_parent, br, &dentry->d_name); ++ err = PTR_ERR(wh_dentry); ++ if (IS_ERR(wh_dentry)) ++ goto out; ++ ++ h_path.dentry = h_parent; ++ h_path.mnt = au_br_mnt(br); ++ au_dtime_store(&dt, parent, &h_path); ++ err = au_do_cpup_wh(cpg, wh_dentry, file); ++ if (unlikely(err)) ++ goto out_wh; ++ ++ dget(wh_dentry); ++ h_path.dentry = wh_dentry; ++ if (!d_is_dir(wh_dentry)) { ++ /* no delegation since it is just created */ ++ err = vfsub_unlink(d_inode(h_parent), &h_path, ++ /*delegated*/NULL, /*force*/0); ++ } else ++ err = vfsub_rmdir(d_inode(h_parent), &h_path); ++ if (unlikely(err)) { ++ AuIOErr("failed remove copied-up tmp file %pd(%d)\n", ++ wh_dentry, err); ++ err = -EIO; ++ } ++ au_dtime_revert(&dt); ++ au_set_hi_wh(d_inode(dentry), bdst, wh_dentry); ++ ++out_wh: ++ dput(wh_dentry); ++out: ++ dput(parent); ++ return err; ++} ++ ++struct au_cpup_wh_args { ++ int *errp; ++ struct au_cp_generic *cpg; ++ struct file *file; ++}; ++ ++static void au_call_cpup_wh(void *args) ++{ ++ struct au_cpup_wh_args *a = args; ++ ++ au_pin_hdir_acquire_nest(a->cpg->pin); ++ *a->errp = au_cpup_wh(a->cpg, a->file); ++ au_pin_hdir_release(a->cpg->pin); ++} ++ ++int au_sio_cpup_wh(struct au_cp_generic *cpg, struct file *file) ++{ ++ int err, wkq_err; ++ aufs_bindex_t bdst; ++ struct dentry *dentry, *parent, *h_orph, *h_parent; ++ struct inode *dir, *h_dir, *h_tmpdir; ++ struct au_wbr *wbr; ++ struct au_pin wh_pin, *pin_orig; ++ ++ dentry = cpg->dentry; ++ bdst = cpg->bdst; ++ parent = dget_parent(dentry); ++ dir = d_inode(parent); ++ h_orph = NULL; ++ h_parent = NULL; ++ h_dir = au_igrab(au_h_iptr(dir, bdst)); ++ h_tmpdir = h_dir; ++ pin_orig = NULL; ++ if (!h_dir->i_nlink) { ++ wbr = au_sbr(dentry->d_sb, bdst)->br_wbr; ++ h_orph = wbr->wbr_orph; ++ ++ h_parent = dget(au_h_dptr(parent, bdst)); ++ au_set_h_dptr(parent, bdst, dget(h_orph)); ++ h_tmpdir = d_inode(h_orph); ++ au_set_h_iptr(dir, bdst, au_igrab(h_tmpdir), /*flags*/0); ++ ++ inode_lock_nested(h_tmpdir, AuLsc_I_PARENT3); ++ /* todo: au_h_open_pre()? */ ++ ++ pin_orig = cpg->pin; ++ au_pin_init(&wh_pin, dentry, bdst, AuLsc_DI_PARENT, ++ AuLsc_I_PARENT3, cpg->pin->udba, AuPin_DI_LOCKED); ++ cpg->pin = &wh_pin; ++ } ++ ++ if (!au_test_h_perm_sio(h_tmpdir, MAY_EXEC | MAY_WRITE) ++ && !au_cpup_sio_test(cpg->pin, d_inode(dentry)->i_mode)) ++ err = au_cpup_wh(cpg, file); ++ else { ++ struct au_cpup_wh_args args = { ++ .errp = &err, ++ .cpg = cpg, ++ .file = file ++ }; ++ wkq_err = au_wkq_wait(au_call_cpup_wh, &args); ++ if (unlikely(wkq_err)) ++ err = wkq_err; ++ } ++ ++ if (h_orph) { ++ inode_unlock(h_tmpdir); ++ /* todo: au_h_open_post()? */ ++ au_set_h_iptr(dir, bdst, au_igrab(h_dir), /*flags*/0); ++ au_set_h_dptr(parent, bdst, h_parent); ++ AuDebugOn(!pin_orig); ++ cpg->pin = pin_orig; ++ } ++ iput(h_dir); ++ dput(parent); ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * generic routine for both of copy-up and copy-down. ++ */ ++/* cf. revalidate function in file.c */ ++int au_cp_dirs(struct dentry *dentry, aufs_bindex_t bdst, ++ int (*cp)(struct dentry *dentry, aufs_bindex_t bdst, ++ struct au_pin *pin, ++ struct dentry *h_parent, void *arg), ++ void *arg) ++{ ++ int err; ++ struct au_pin pin; ++ struct dentry *d, *parent, *h_parent, *real_parent, *h_dentry; ++ ++ err = 0; ++ parent = dget_parent(dentry); ++ if (IS_ROOT(parent)) ++ goto out; ++ ++ au_pin_init(&pin, dentry, bdst, AuLsc_DI_PARENT2, AuLsc_I_PARENT2, ++ au_opt_udba(dentry->d_sb), AuPin_MNT_WRITE); ++ ++ /* do not use au_dpage */ ++ real_parent = parent; ++ while (1) { ++ dput(parent); ++ parent = dget_parent(dentry); ++ h_parent = au_h_dptr(parent, bdst); ++ if (h_parent) ++ goto out; /* success */ ++ ++ /* find top dir which is necessary to cpup */ ++ do { ++ d = parent; ++ dput(parent); ++ parent = dget_parent(d); ++ di_read_lock_parent3(parent, !AuLock_IR); ++ h_parent = au_h_dptr(parent, bdst); ++ di_read_unlock(parent, !AuLock_IR); ++ } while (!h_parent); ++ ++ if (d != real_parent) ++ di_write_lock_child3(d); ++ ++ /* somebody else might create while we were sleeping */ ++ h_dentry = au_h_dptr(d, bdst); ++ if (!h_dentry || d_is_negative(h_dentry)) { ++ if (h_dentry) ++ au_update_dbtop(d); ++ ++ au_pin_set_dentry(&pin, d); ++ err = au_do_pin(&pin); ++ if (!err) { ++ err = cp(d, bdst, &pin, h_parent, arg); ++ au_unpin(&pin); ++ } ++ } ++ ++ if (d != real_parent) ++ di_write_unlock(d); ++ if (unlikely(err)) ++ break; ++ } ++ ++out: ++ dput(parent); ++ return err; ++} ++ ++static int au_cpup_dir(struct dentry *dentry, aufs_bindex_t bdst, ++ struct au_pin *pin, ++ struct dentry *h_parent __maybe_unused, ++ void *arg __maybe_unused) ++{ ++ struct au_cp_generic cpg = { ++ .dentry = dentry, ++ .bdst = bdst, ++ .bsrc = -1, ++ .len = 0, ++ .pin = pin, ++ .flags = AuCpup_DTIME ++ }; ++ return au_sio_cpup_simple(&cpg); ++} ++ ++int au_cpup_dirs(struct dentry *dentry, aufs_bindex_t bdst) ++{ ++ return au_cp_dirs(dentry, bdst, au_cpup_dir, NULL); ++} ++ ++int au_test_and_cpup_dirs(struct dentry *dentry, aufs_bindex_t bdst) ++{ ++ int err; ++ struct dentry *parent; ++ struct inode *dir; ++ ++ parent = dget_parent(dentry); ++ dir = d_inode(parent); ++ err = 0; ++ if (au_h_iptr(dir, bdst)) ++ goto out; ++ ++ di_read_unlock(parent, AuLock_IR); ++ di_write_lock_parent(parent); ++ /* someone else might change our inode while we were sleeping */ ++ if (!au_h_iptr(dir, bdst)) ++ err = au_cpup_dirs(dentry, bdst); ++ di_downgrade_lock(parent, AuLock_IR); ++ ++out: ++ dput(parent); ++ return err; ++} +diff --git a/fs/aufs/cpup.h b/fs/aufs/cpup.h +new file mode 100644 +index 0000000..7721429 +--- /dev/null ++++ b/fs/aufs/cpup.h +@@ -0,0 +1,94 @@ ++/* ++ * Copyright (C) 2005-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * copy-up/down functions ++ */ ++ ++#ifndef __AUFS_CPUP_H__ ++#define __AUFS_CPUP_H__ ++ ++#ifdef __KERNEL__ ++ ++#include ++ ++struct inode; ++struct file; ++struct au_pin; ++ ++void au_cpup_attr_flags(struct inode *dst, unsigned int iflags); ++void au_cpup_attr_timesizes(struct inode *inode); ++void au_cpup_attr_nlink(struct inode *inode, int force); ++void au_cpup_attr_changeable(struct inode *inode); ++void au_cpup_igen(struct inode *inode, struct inode *h_inode); ++void au_cpup_attr_all(struct inode *inode, int force); ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct au_cp_generic { ++ struct dentry *dentry; ++ aufs_bindex_t bdst, bsrc; ++ loff_t len; ++ struct au_pin *pin; ++ unsigned int flags; ++}; ++ ++/* cpup flags */ ++#define AuCpup_DTIME 1 /* do dtime_store/revert */ ++#define AuCpup_KEEPLINO (1 << 1) /* do not clear the lower xino, ++ for link(2) */ ++#define AuCpup_RENAME (1 << 2) /* rename after cpup */ ++#define AuCpup_HOPEN (1 << 3) /* call h_open_pre/post() in ++ cpup */ ++#define AuCpup_OVERWRITE (1 << 4) /* allow overwriting the ++ existing entry */ ++#define AuCpup_RWDST (1 << 5) /* force write target even if ++ the branch is marked as RO */ ++ ++#define au_ftest_cpup(flags, name) ((flags) & AuCpup_##name) ++#define au_fset_cpup(flags, name) \ ++ do { (flags) |= AuCpup_##name; } while (0) ++#define au_fclr_cpup(flags, name) \ ++ do { (flags) &= ~AuCpup_##name; } while (0) ++ ++int au_copy_file(struct file *dst, struct file *src, loff_t len); ++int au_sio_cpup_simple(struct au_cp_generic *cpg); ++int au_sio_cpdown_simple(struct au_cp_generic *cpg); ++int au_sio_cpup_wh(struct au_cp_generic *cpg, struct file *file); ++ ++int au_cp_dirs(struct dentry *dentry, aufs_bindex_t bdst, ++ int (*cp)(struct dentry *dentry, aufs_bindex_t bdst, ++ struct au_pin *pin, ++ struct dentry *h_parent, void *arg), ++ void *arg); ++int au_cpup_dirs(struct dentry *dentry, aufs_bindex_t bdst); ++int au_test_and_cpup_dirs(struct dentry *dentry, aufs_bindex_t bdst); ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* keep timestamps when copyup */ ++struct au_dtime { ++ struct dentry *dt_dentry; ++ struct path dt_h_path; ++ struct timespec dt_atime, dt_mtime; ++}; ++void au_dtime_store(struct au_dtime *dt, struct dentry *dentry, ++ struct path *h_path); ++void au_dtime_revert(struct au_dtime *dt); ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_CPUP_H__ */ +diff --git a/fs/aufs/dbgaufs.c b/fs/aufs/dbgaufs.c +new file mode 100644 +index 0000000..ef297ab +--- /dev/null ++++ b/fs/aufs/dbgaufs.c +@@ -0,0 +1,432 @@ ++/* ++ * Copyright (C) 2005-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * debugfs interface ++ */ ++ ++#include ++#include "aufs.h" ++ ++#ifndef CONFIG_SYSFS ++#error DEBUG_FS depends upon SYSFS ++#endif ++ ++static struct dentry *dbgaufs; ++static const mode_t dbgaufs_mode = S_IRUSR | S_IRGRP | S_IROTH; ++ ++/* 20 is max digits length of ulong 64 */ ++struct dbgaufs_arg { ++ int n; ++ char a[20 * 4]; ++}; ++ ++/* ++ * common function for all XINO files ++ */ ++static int dbgaufs_xi_release(struct inode *inode __maybe_unused, ++ struct file *file) ++{ ++ kfree(file->private_data); ++ return 0; ++} ++ ++static int dbgaufs_xi_open(struct file *xf, struct file *file, int do_fcnt) ++{ ++ int err; ++ struct kstat st; ++ struct dbgaufs_arg *p; ++ ++ err = -ENOMEM; ++ p = kmalloc(sizeof(*p), GFP_NOFS); ++ if (unlikely(!p)) ++ goto out; ++ ++ err = 0; ++ p->n = 0; ++ file->private_data = p; ++ if (!xf) ++ goto out; ++ ++ err = vfs_getattr(&xf->f_path, &st); ++ if (!err) { ++ if (do_fcnt) ++ p->n = snprintf ++ (p->a, sizeof(p->a), "%ld, %llux%lu %lld\n", ++ (long)file_count(xf), st.blocks, st.blksize, ++ (long long)st.size); ++ else ++ p->n = snprintf(p->a, sizeof(p->a), "%llux%lu %lld\n", ++ st.blocks, st.blksize, ++ (long long)st.size); ++ AuDebugOn(p->n >= sizeof(p->a)); ++ } else { ++ p->n = snprintf(p->a, sizeof(p->a), "err %d\n", err); ++ err = 0; ++ } ++ ++out: ++ return err; ++ ++} ++ ++static ssize_t dbgaufs_xi_read(struct file *file, char __user *buf, ++ size_t count, loff_t *ppos) ++{ ++ struct dbgaufs_arg *p; ++ ++ p = file->private_data; ++ return simple_read_from_buffer(buf, count, ppos, p->a, p->n); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct dbgaufs_plink_arg { ++ int n; ++ char a[]; ++}; ++ ++static int dbgaufs_plink_release(struct inode *inode __maybe_unused, ++ struct file *file) ++{ ++ free_page((unsigned long)file->private_data); ++ return 0; ++} ++ ++static int dbgaufs_plink_open(struct inode *inode, struct file *file) ++{ ++ int err, i, limit; ++ unsigned long n, sum; ++ struct dbgaufs_plink_arg *p; ++ struct au_sbinfo *sbinfo; ++ struct super_block *sb; ++ struct au_sphlhead *sphl; ++ ++ err = -ENOMEM; ++ p = (void *)get_zeroed_page(GFP_NOFS); ++ if (unlikely(!p)) ++ goto out; ++ ++ err = -EFBIG; ++ sbinfo = inode->i_private; ++ sb = sbinfo->si_sb; ++ si_noflush_read_lock(sb); ++ if (au_opt_test(au_mntflags(sb), PLINK)) { ++ limit = PAGE_SIZE - sizeof(p->n); ++ ++ /* the number of buckets */ ++ n = snprintf(p->a + p->n, limit, "%d\n", AuPlink_NHASH); ++ p->n += n; ++ limit -= n; ++ ++ sum = 0; ++ for (i = 0, sphl = sbinfo->si_plink; ++ i < AuPlink_NHASH; ++ i++, sphl++) { ++ n = au_sphl_count(sphl); ++ sum += n; ++ ++ n = snprintf(p->a + p->n, limit, "%lu ", n); ++ p->n += n; ++ limit -= n; ++ if (unlikely(limit <= 0)) ++ goto out_free; ++ } ++ p->a[p->n - 1] = '\n'; ++ ++ /* the sum of plinks */ ++ n = snprintf(p->a + p->n, limit, "%lu\n", sum); ++ p->n += n; ++ limit -= n; ++ if (unlikely(limit <= 0)) ++ goto out_free; ++ } else { ++#define str "1\n0\n0\n" ++ p->n = sizeof(str) - 1; ++ strcpy(p->a, str); ++#undef str ++ } ++ si_read_unlock(sb); ++ ++ err = 0; ++ file->private_data = p; ++ goto out; /* success */ ++ ++out_free: ++ free_page((unsigned long)p); ++out: ++ return err; ++} ++ ++static ssize_t dbgaufs_plink_read(struct file *file, char __user *buf, ++ size_t count, loff_t *ppos) ++{ ++ struct dbgaufs_plink_arg *p; ++ ++ p = file->private_data; ++ return simple_read_from_buffer(buf, count, ppos, p->a, p->n); ++} ++ ++static const struct file_operations dbgaufs_plink_fop = { ++ .owner = THIS_MODULE, ++ .open = dbgaufs_plink_open, ++ .release = dbgaufs_plink_release, ++ .read = dbgaufs_plink_read ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int dbgaufs_xib_open(struct inode *inode, struct file *file) ++{ ++ int err; ++ struct au_sbinfo *sbinfo; ++ struct super_block *sb; ++ ++ sbinfo = inode->i_private; ++ sb = sbinfo->si_sb; ++ si_noflush_read_lock(sb); ++ err = dbgaufs_xi_open(sbinfo->si_xib, file, /*do_fcnt*/0); ++ si_read_unlock(sb); ++ return err; ++} ++ ++static const struct file_operations dbgaufs_xib_fop = { ++ .owner = THIS_MODULE, ++ .open = dbgaufs_xib_open, ++ .release = dbgaufs_xi_release, ++ .read = dbgaufs_xi_read ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++#define DbgaufsXi_PREFIX "xi" ++ ++static int dbgaufs_xino_open(struct inode *inode, struct file *file) ++{ ++ int err; ++ long l; ++ struct au_sbinfo *sbinfo; ++ struct super_block *sb; ++ struct file *xf; ++ struct qstr *name; ++ ++ err = -ENOENT; ++ xf = NULL; ++ name = &file->f_path.dentry->d_name; ++ if (unlikely(name->len < sizeof(DbgaufsXi_PREFIX) ++ || memcmp(name->name, DbgaufsXi_PREFIX, ++ sizeof(DbgaufsXi_PREFIX) - 1))) ++ goto out; ++ err = kstrtol(name->name + sizeof(DbgaufsXi_PREFIX) - 1, 10, &l); ++ if (unlikely(err)) ++ goto out; ++ ++ sbinfo = inode->i_private; ++ sb = sbinfo->si_sb; ++ si_noflush_read_lock(sb); ++ if (l <= au_sbbot(sb)) { ++ xf = au_sbr(sb, (aufs_bindex_t)l)->br_xino.xi_file; ++ err = dbgaufs_xi_open(xf, file, /*do_fcnt*/1); ++ } else ++ err = -ENOENT; ++ si_read_unlock(sb); ++ ++out: ++ return err; ++} ++ ++static const struct file_operations dbgaufs_xino_fop = { ++ .owner = THIS_MODULE, ++ .open = dbgaufs_xino_open, ++ .release = dbgaufs_xi_release, ++ .read = dbgaufs_xi_read ++}; ++ ++void dbgaufs_brs_del(struct super_block *sb, aufs_bindex_t bindex) ++{ ++ aufs_bindex_t bbot; ++ struct au_branch *br; ++ struct au_xino_file *xi; ++ ++ if (!au_sbi(sb)->si_dbgaufs) ++ return; ++ ++ bbot = au_sbbot(sb); ++ for (; bindex <= bbot; bindex++) { ++ br = au_sbr(sb, bindex); ++ xi = &br->br_xino; ++ debugfs_remove(xi->xi_dbgaufs); ++ xi->xi_dbgaufs = NULL; ++ } ++} ++ ++void dbgaufs_brs_add(struct super_block *sb, aufs_bindex_t bindex) ++{ ++ struct au_sbinfo *sbinfo; ++ struct dentry *parent; ++ struct au_branch *br; ++ struct au_xino_file *xi; ++ aufs_bindex_t bbot; ++ char name[sizeof(DbgaufsXi_PREFIX) + 5]; /* "xi" bindex NULL */ ++ ++ sbinfo = au_sbi(sb); ++ parent = sbinfo->si_dbgaufs; ++ if (!parent) ++ return; ++ ++ bbot = au_sbbot(sb); ++ for (; bindex <= bbot; bindex++) { ++ snprintf(name, sizeof(name), DbgaufsXi_PREFIX "%d", bindex); ++ br = au_sbr(sb, bindex); ++ xi = &br->br_xino; ++ AuDebugOn(xi->xi_dbgaufs); ++ xi->xi_dbgaufs = debugfs_create_file(name, dbgaufs_mode, parent, ++ sbinfo, &dbgaufs_xino_fop); ++ /* ignore an error */ ++ if (unlikely(!xi->xi_dbgaufs)) ++ AuWarn1("failed %s under debugfs\n", name); ++ } ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++#ifdef CONFIG_AUFS_EXPORT ++static int dbgaufs_xigen_open(struct inode *inode, struct file *file) ++{ ++ int err; ++ struct au_sbinfo *sbinfo; ++ struct super_block *sb; ++ ++ sbinfo = inode->i_private; ++ sb = sbinfo->si_sb; ++ si_noflush_read_lock(sb); ++ err = dbgaufs_xi_open(sbinfo->si_xigen, file, /*do_fcnt*/0); ++ si_read_unlock(sb); ++ return err; ++} ++ ++static const struct file_operations dbgaufs_xigen_fop = { ++ .owner = THIS_MODULE, ++ .open = dbgaufs_xigen_open, ++ .release = dbgaufs_xi_release, ++ .read = dbgaufs_xi_read ++}; ++ ++static int dbgaufs_xigen_init(struct au_sbinfo *sbinfo) ++{ ++ int err; ++ ++ /* ++ * This function is a dynamic '__init' function actually, ++ * so the tiny check for si_rwsem is unnecessary. ++ */ ++ /* AuRwMustWriteLock(&sbinfo->si_rwsem); */ ++ ++ err = -EIO; ++ sbinfo->si_dbgaufs_xigen = debugfs_create_file ++ ("xigen", dbgaufs_mode, sbinfo->si_dbgaufs, sbinfo, ++ &dbgaufs_xigen_fop); ++ if (sbinfo->si_dbgaufs_xigen) ++ err = 0; ++ ++ return err; ++} ++#else ++static int dbgaufs_xigen_init(struct au_sbinfo *sbinfo) ++{ ++ return 0; ++} ++#endif /* CONFIG_AUFS_EXPORT */ ++ ++/* ---------------------------------------------------------------------- */ ++ ++void dbgaufs_si_fin(struct au_sbinfo *sbinfo) ++{ ++ /* ++ * This function is a dynamic '__fin' function actually, ++ * so the tiny check for si_rwsem is unnecessary. ++ */ ++ /* AuRwMustWriteLock(&sbinfo->si_rwsem); */ ++ ++ debugfs_remove_recursive(sbinfo->si_dbgaufs); ++ sbinfo->si_dbgaufs = NULL; ++ kobject_put(&sbinfo->si_kobj); ++} ++ ++int dbgaufs_si_init(struct au_sbinfo *sbinfo) ++{ ++ int err; ++ char name[SysaufsSiNameLen]; ++ ++ /* ++ * This function is a dynamic '__init' function actually, ++ * so the tiny check for si_rwsem is unnecessary. ++ */ ++ /* AuRwMustWriteLock(&sbinfo->si_rwsem); */ ++ ++ err = -ENOENT; ++ if (!dbgaufs) { ++ AuErr1("/debug/aufs is uninitialized\n"); ++ goto out; ++ } ++ ++ err = -EIO; ++ sysaufs_name(sbinfo, name); ++ sbinfo->si_dbgaufs = debugfs_create_dir(name, dbgaufs); ++ if (unlikely(!sbinfo->si_dbgaufs)) ++ goto out; ++ kobject_get(&sbinfo->si_kobj); ++ ++ sbinfo->si_dbgaufs_xib = debugfs_create_file ++ ("xib", dbgaufs_mode, sbinfo->si_dbgaufs, sbinfo, ++ &dbgaufs_xib_fop); ++ if (unlikely(!sbinfo->si_dbgaufs_xib)) ++ goto out_dir; ++ ++ sbinfo->si_dbgaufs_plink = debugfs_create_file ++ ("plink", dbgaufs_mode, sbinfo->si_dbgaufs, sbinfo, ++ &dbgaufs_plink_fop); ++ if (unlikely(!sbinfo->si_dbgaufs_plink)) ++ goto out_dir; ++ ++ err = dbgaufs_xigen_init(sbinfo); ++ if (!err) ++ goto out; /* success */ ++ ++out_dir: ++ dbgaufs_si_fin(sbinfo); ++out: ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++void dbgaufs_fin(void) ++{ ++ debugfs_remove(dbgaufs); ++} ++ ++int __init dbgaufs_init(void) ++{ ++ int err; ++ ++ err = -EIO; ++ dbgaufs = debugfs_create_dir(AUFS_NAME, NULL); ++ if (dbgaufs) ++ err = 0; ++ return err; ++} +diff --git a/fs/aufs/dbgaufs.h b/fs/aufs/dbgaufs.h +new file mode 100644 +index 0000000..d1e09bd +--- /dev/null ++++ b/fs/aufs/dbgaufs.h +@@ -0,0 +1,48 @@ ++/* ++ * Copyright (C) 2005-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * debugfs interface ++ */ ++ ++#ifndef __DBGAUFS_H__ ++#define __DBGAUFS_H__ ++ ++#ifdef __KERNEL__ ++ ++struct super_block; ++struct au_sbinfo; ++ ++#ifdef CONFIG_DEBUG_FS ++/* dbgaufs.c */ ++void dbgaufs_brs_del(struct super_block *sb, aufs_bindex_t bindex); ++void dbgaufs_brs_add(struct super_block *sb, aufs_bindex_t bindex); ++void dbgaufs_si_fin(struct au_sbinfo *sbinfo); ++int dbgaufs_si_init(struct au_sbinfo *sbinfo); ++void dbgaufs_fin(void); ++int __init dbgaufs_init(void); ++#else ++AuStubVoid(dbgaufs_brs_del, struct super_block *sb, aufs_bindex_t bindex) ++AuStubVoid(dbgaufs_brs_add, struct super_block *sb, aufs_bindex_t bindex) ++AuStubVoid(dbgaufs_si_fin, struct au_sbinfo *sbinfo) ++AuStubInt0(dbgaufs_si_init, struct au_sbinfo *sbinfo) ++AuStubVoid(dbgaufs_fin, void) ++AuStubInt0(__init dbgaufs_init, void) ++#endif /* CONFIG_DEBUG_FS */ ++ ++#endif /* __KERNEL__ */ ++#endif /* __DBGAUFS_H__ */ +diff --git a/fs/aufs/dcsub.c b/fs/aufs/dcsub.c +new file mode 100644 +index 0000000..832baa4 +--- /dev/null ++++ b/fs/aufs/dcsub.c +@@ -0,0 +1,224 @@ ++/* ++ * Copyright (C) 2005-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * sub-routines for dentry cache ++ */ ++ ++#include "aufs.h" ++ ++static void au_dpage_free(struct au_dpage *dpage) ++{ ++ int i; ++ struct dentry **p; ++ ++ p = dpage->dentries; ++ for (i = 0; i < dpage->ndentry; i++) ++ dput(*p++); ++ free_page((unsigned long)dpage->dentries); ++} ++ ++int au_dpages_init(struct au_dcsub_pages *dpages, gfp_t gfp) ++{ ++ int err; ++ void *p; ++ ++ err = -ENOMEM; ++ dpages->dpages = kmalloc(sizeof(*dpages->dpages), gfp); ++ if (unlikely(!dpages->dpages)) ++ goto out; ++ ++ p = (void *)__get_free_page(gfp); ++ if (unlikely(!p)) ++ goto out_dpages; ++ ++ dpages->dpages[0].ndentry = 0; ++ dpages->dpages[0].dentries = p; ++ dpages->ndpage = 1; ++ return 0; /* success */ ++ ++out_dpages: ++ kfree(dpages->dpages); ++out: ++ return err; ++} ++ ++void au_dpages_free(struct au_dcsub_pages *dpages) ++{ ++ int i; ++ struct au_dpage *p; ++ ++ p = dpages->dpages; ++ for (i = 0; i < dpages->ndpage; i++) ++ au_dpage_free(p++); ++ kfree(dpages->dpages); ++} ++ ++static int au_dpages_append(struct au_dcsub_pages *dpages, ++ struct dentry *dentry, gfp_t gfp) ++{ ++ int err, sz; ++ struct au_dpage *dpage; ++ void *p; ++ ++ dpage = dpages->dpages + dpages->ndpage - 1; ++ sz = PAGE_SIZE / sizeof(dentry); ++ if (unlikely(dpage->ndentry >= sz)) { ++ AuLabel(new dpage); ++ err = -ENOMEM; ++ sz = dpages->ndpage * sizeof(*dpages->dpages); ++ p = au_kzrealloc(dpages->dpages, sz, ++ sz + sizeof(*dpages->dpages), gfp); ++ if (unlikely(!p)) ++ goto out; ++ ++ dpages->dpages = p; ++ dpage = dpages->dpages + dpages->ndpage; ++ p = (void *)__get_free_page(gfp); ++ if (unlikely(!p)) ++ goto out; ++ ++ dpage->ndentry = 0; ++ dpage->dentries = p; ++ dpages->ndpage++; ++ } ++ ++ AuDebugOn(au_dcount(dentry) <= 0); ++ dpage->dentries[dpage->ndentry++] = dget_dlock(dentry); ++ return 0; /* success */ ++ ++out: ++ return err; ++} ++ ++/* todo: BAD approach */ ++/* copied from linux/fs/dcache.c */ ++enum d_walk_ret { ++ D_WALK_CONTINUE, ++ D_WALK_QUIT, ++ D_WALK_NORETRY, ++ D_WALK_SKIP, ++}; ++ ++extern void d_walk(struct dentry *parent, void *data, ++ enum d_walk_ret (*enter)(void *, struct dentry *), ++ void (*finish)(void *)); ++ ++struct ac_dpages_arg { ++ int err; ++ struct au_dcsub_pages *dpages; ++ struct super_block *sb; ++ au_dpages_test test; ++ void *arg; ++}; ++ ++static enum d_walk_ret au_call_dpages_append(void *_arg, struct dentry *dentry) ++{ ++ enum d_walk_ret ret; ++ struct ac_dpages_arg *arg = _arg; ++ ++ ret = D_WALK_CONTINUE; ++ if (dentry->d_sb == arg->sb ++ && !IS_ROOT(dentry) ++ && au_dcount(dentry) > 0 ++ && au_di(dentry) ++ && (!arg->test || arg->test(dentry, arg->arg))) { ++ arg->err = au_dpages_append(arg->dpages, dentry, GFP_ATOMIC); ++ if (unlikely(arg->err)) ++ ret = D_WALK_QUIT; ++ } ++ ++ return ret; ++} ++ ++int au_dcsub_pages(struct au_dcsub_pages *dpages, struct dentry *root, ++ au_dpages_test test, void *arg) ++{ ++ struct ac_dpages_arg args = { ++ .err = 0, ++ .dpages = dpages, ++ .sb = root->d_sb, ++ .test = test, ++ .arg = arg ++ }; ++ ++ d_walk(root, &args, au_call_dpages_append, NULL); ++ ++ return args.err; ++} ++ ++int au_dcsub_pages_rev(struct au_dcsub_pages *dpages, struct dentry *dentry, ++ int do_include, au_dpages_test test, void *arg) ++{ ++ int err; ++ ++ err = 0; ++ write_seqlock(&rename_lock); ++ spin_lock(&dentry->d_lock); ++ if (do_include ++ && au_dcount(dentry) > 0 ++ && (!test || test(dentry, arg))) ++ err = au_dpages_append(dpages, dentry, GFP_ATOMIC); ++ spin_unlock(&dentry->d_lock); ++ if (unlikely(err)) ++ goto out; ++ ++ /* ++ * RCU for vfsmount is unnecessary since this is a traverse in a single ++ * mount ++ */ ++ while (!IS_ROOT(dentry)) { ++ dentry = dentry->d_parent; /* rename_lock is locked */ ++ spin_lock(&dentry->d_lock); ++ if (au_dcount(dentry) > 0 ++ && (!test || test(dentry, arg))) ++ err = au_dpages_append(dpages, dentry, GFP_ATOMIC); ++ spin_unlock(&dentry->d_lock); ++ if (unlikely(err)) ++ break; ++ } ++ ++out: ++ write_sequnlock(&rename_lock); ++ return err; ++} ++ ++static inline int au_dcsub_dpages_aufs(struct dentry *dentry, void *arg) ++{ ++ return au_di(dentry) && dentry->d_sb == arg; ++} ++ ++int au_dcsub_pages_rev_aufs(struct au_dcsub_pages *dpages, ++ struct dentry *dentry, int do_include) ++{ ++ return au_dcsub_pages_rev(dpages, dentry, do_include, ++ au_dcsub_dpages_aufs, dentry->d_sb); ++} ++ ++int au_test_subdir(struct dentry *d1, struct dentry *d2) ++{ ++ struct path path[2] = { ++ { ++ .dentry = d1 ++ }, ++ { ++ .dentry = d2 ++ } ++ }; ++ ++ return path_is_under(path + 0, path + 1); ++} +diff --git a/fs/aufs/dcsub.h b/fs/aufs/dcsub.h +new file mode 100644 +index 0000000..9f4a2b5 +--- /dev/null ++++ b/fs/aufs/dcsub.h +@@ -0,0 +1,136 @@ ++/* ++ * Copyright (C) 2005-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * sub-routines for dentry cache ++ */ ++ ++#ifndef __AUFS_DCSUB_H__ ++#define __AUFS_DCSUB_H__ ++ ++#ifdef __KERNEL__ ++ ++#include ++#include ++ ++struct au_dpage { ++ int ndentry; ++ struct dentry **dentries; ++}; ++ ++struct au_dcsub_pages { ++ int ndpage; ++ struct au_dpage *dpages; ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* dcsub.c */ ++int au_dpages_init(struct au_dcsub_pages *dpages, gfp_t gfp); ++void au_dpages_free(struct au_dcsub_pages *dpages); ++typedef int (*au_dpages_test)(struct dentry *dentry, void *arg); ++int au_dcsub_pages(struct au_dcsub_pages *dpages, struct dentry *root, ++ au_dpages_test test, void *arg); ++int au_dcsub_pages_rev(struct au_dcsub_pages *dpages, struct dentry *dentry, ++ int do_include, au_dpages_test test, void *arg); ++int au_dcsub_pages_rev_aufs(struct au_dcsub_pages *dpages, ++ struct dentry *dentry, int do_include); ++int au_test_subdir(struct dentry *d1, struct dentry *d2); ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * todo: in linux-3.13, several similar (but faster) helpers are added to ++ * include/linux/dcache.h. Try them (in the future). ++ */ ++ ++static inline int au_d_hashed_positive(struct dentry *d) ++{ ++ int err; ++ struct inode *inode = d_inode(d); ++ ++ err = 0; ++ if (unlikely(d_unhashed(d) ++ || d_is_negative(d) ++ || !inode->i_nlink)) ++ err = -ENOENT; ++ return err; ++} ++ ++static inline int au_d_linkable(struct dentry *d) ++{ ++ int err; ++ struct inode *inode = d_inode(d); ++ ++ err = au_d_hashed_positive(d); ++ if (err ++ && d_is_positive(d) ++ && (inode->i_state & I_LINKABLE)) ++ err = 0; ++ return err; ++} ++ ++static inline int au_d_alive(struct dentry *d) ++{ ++ int err; ++ struct inode *inode; ++ ++ err = 0; ++ if (!IS_ROOT(d)) ++ err = au_d_hashed_positive(d); ++ else { ++ inode = d_inode(d); ++ if (unlikely(d_unlinked(d) ++ || d_is_negative(d) ++ || !inode->i_nlink)) ++ err = -ENOENT; ++ } ++ return err; ++} ++ ++static inline int au_alive_dir(struct dentry *d) ++{ ++ int err; ++ ++ err = au_d_alive(d); ++ if (unlikely(err || IS_DEADDIR(d_inode(d)))) ++ err = -ENOENT; ++ return err; ++} ++ ++static inline int au_qstreq(struct qstr *a, struct qstr *b) ++{ ++ return a->len == b->len ++ && !memcmp(a->name, b->name, a->len); ++} ++ ++/* ++ * by the commit ++ * 360f547 2015-01-25 dcache: let the dentry count go down to zero without ++ * taking d_lock ++ * the type of d_lockref.count became int, but the inlined function d_count() ++ * still returns unsigned int. ++ * I don't know why. Maybe it is for every d_count() users? ++ * Anyway au_dcount() lives on. ++ */ ++static inline int au_dcount(struct dentry *d) ++{ ++ return (int)d_count(d); ++} ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_DCSUB_H__ */ +diff --git a/fs/aufs/debug.c b/fs/aufs/debug.c +new file mode 100644 +index 0000000..5fa12e9 +--- /dev/null ++++ b/fs/aufs/debug.c +@@ -0,0 +1,442 @@ ++/* ++ * Copyright (C) 2005-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * debug print functions ++ */ ++ ++#include "aufs.h" ++ ++/* Returns 0, or -errno. arg is in kp->arg. */ ++static int param_atomic_t_set(const char *val, const struct kernel_param *kp) ++{ ++ int err, n; ++ ++ err = kstrtoint(val, 0, &n); ++ if (!err) { ++ if (n > 0) ++ au_debug_on(); ++ else ++ au_debug_off(); ++ } ++ return err; ++} ++ ++/* Returns length written or -errno. Buffer is 4k (ie. be short!) */ ++static int param_atomic_t_get(char *buffer, const struct kernel_param *kp) ++{ ++ atomic_t *a; ++ ++ a = kp->arg; ++ return sprintf(buffer, "%d", atomic_read(a)); ++} ++ ++static struct kernel_param_ops param_ops_atomic_t = { ++ .set = param_atomic_t_set, ++ .get = param_atomic_t_get ++ /* void (*free)(void *arg) */ ++}; ++ ++atomic_t aufs_debug = ATOMIC_INIT(0); ++MODULE_PARM_DESC(debug, "debug print"); ++module_param_named(debug, aufs_debug, atomic_t, S_IRUGO | S_IWUSR | S_IWGRP); ++ ++DEFINE_MUTEX(au_dbg_mtx); /* just to serialize the dbg msgs */ ++char *au_plevel = KERN_DEBUG; ++#define dpri(fmt, ...) do { \ ++ if ((au_plevel \ ++ && strcmp(au_plevel, KERN_DEBUG)) \ ++ || au_debug_test()) \ ++ printk("%s" fmt, au_plevel, ##__VA_ARGS__); \ ++} while (0) ++ ++/* ---------------------------------------------------------------------- */ ++ ++void au_dpri_whlist(struct au_nhash *whlist) ++{ ++ unsigned long ul, n; ++ struct hlist_head *head; ++ struct au_vdir_wh *pos; ++ ++ n = whlist->nh_num; ++ head = whlist->nh_head; ++ for (ul = 0; ul < n; ul++) { ++ hlist_for_each_entry(pos, head, wh_hash) ++ dpri("b%d, %.*s, %d\n", ++ pos->wh_bindex, ++ pos->wh_str.len, pos->wh_str.name, ++ pos->wh_str.len); ++ head++; ++ } ++} ++ ++void au_dpri_vdir(struct au_vdir *vdir) ++{ ++ unsigned long ul; ++ union au_vdir_deblk_p p; ++ unsigned char *o; ++ ++ if (!vdir || IS_ERR(vdir)) { ++ dpri("err %ld\n", PTR_ERR(vdir)); ++ return; ++ } ++ ++ dpri("deblk %u, nblk %lu, deblk %p, last{%lu, %p}, ver %lu\n", ++ vdir->vd_deblk_sz, vdir->vd_nblk, vdir->vd_deblk, ++ vdir->vd_last.ul, vdir->vd_last.p.deblk, vdir->vd_version); ++ for (ul = 0; ul < vdir->vd_nblk; ul++) { ++ p.deblk = vdir->vd_deblk[ul]; ++ o = p.deblk; ++ dpri("[%lu]: %p\n", ul, o); ++ } ++} ++ ++static int do_pri_inode(aufs_bindex_t bindex, struct inode *inode, int hn, ++ struct dentry *wh) ++{ ++ char *n = NULL; ++ int l = 0; ++ ++ if (!inode || IS_ERR(inode)) { ++ dpri("i%d: err %ld\n", bindex, PTR_ERR(inode)); ++ return -1; ++ } ++ ++ /* the type of i_blocks depends upon CONFIG_LBDAF */ ++ BUILD_BUG_ON(sizeof(inode->i_blocks) != sizeof(unsigned long) ++ && sizeof(inode->i_blocks) != sizeof(u64)); ++ if (wh) { ++ n = (void *)wh->d_name.name; ++ l = wh->d_name.len; ++ } ++ ++ dpri("i%d: %p, i%lu, %s, cnt %d, nl %u, 0%o, sz %llu, blk %llu," ++ " hn %d, ct %lld, np %lu, st 0x%lx, f 0x%x, v %llu, g %x%s%.*s\n", ++ bindex, inode, ++ inode->i_ino, inode->i_sb ? au_sbtype(inode->i_sb) : "??", ++ atomic_read(&inode->i_count), inode->i_nlink, inode->i_mode, ++ i_size_read(inode), (unsigned long long)inode->i_blocks, ++ hn, (long long)timespec_to_ns(&inode->i_ctime) & 0x0ffff, ++ inode->i_mapping ? inode->i_mapping->nrpages : 0, ++ inode->i_state, inode->i_flags, inode->i_version, ++ inode->i_generation, ++ l ? ", wh " : "", l, n); ++ return 0; ++} ++ ++void au_dpri_inode(struct inode *inode) ++{ ++ struct au_iinfo *iinfo; ++ struct au_hinode *hi; ++ aufs_bindex_t bindex; ++ int err, hn; ++ ++ err = do_pri_inode(-1, inode, -1, NULL); ++ if (err || !au_test_aufs(inode->i_sb) || is_bad_inode(inode)) ++ return; ++ ++ iinfo = au_ii(inode); ++ dpri("i-1: btop %d, bbot %d, gen %d\n", ++ iinfo->ii_btop, iinfo->ii_bbot, au_iigen(inode, NULL)); ++ if (iinfo->ii_btop < 0) ++ return; ++ hn = 0; ++ for (bindex = iinfo->ii_btop; bindex <= iinfo->ii_bbot; bindex++) { ++ hi = au_hinode(iinfo, bindex); ++ hn = !!au_hn(hi); ++ do_pri_inode(bindex, hi->hi_inode, hn, hi->hi_whdentry); ++ } ++} ++ ++void au_dpri_dalias(struct inode *inode) ++{ ++ struct dentry *d; ++ ++ spin_lock(&inode->i_lock); ++ hlist_for_each_entry(d, &inode->i_dentry, d_u.d_alias) ++ au_dpri_dentry(d); ++ spin_unlock(&inode->i_lock); ++} ++ ++static int do_pri_dentry(aufs_bindex_t bindex, struct dentry *dentry) ++{ ++ struct dentry *wh = NULL; ++ int hn; ++ struct au_iinfo *iinfo; ++ struct au_hinode *hi; ++ struct inode *inode; ++ ++ if (!dentry || IS_ERR(dentry)) { ++ dpri("d%d: err %ld\n", bindex, PTR_ERR(dentry)); ++ return -1; ++ } ++ /* do not call dget_parent() here */ ++ /* note: access d_xxx without d_lock */ ++ dpri("d%d: %p, %pd2?, %s, cnt %d, flags 0x%x, %shashed\n", ++ bindex, dentry, dentry, ++ dentry->d_sb ? au_sbtype(dentry->d_sb) : "??", ++ au_dcount(dentry), dentry->d_flags, ++ d_unhashed(dentry) ? "un" : ""); ++ hn = -1; ++ inode = NULL; ++ if (bindex >= 0 ++ && d_is_positive(dentry) ++ && au_test_aufs(dentry->d_sb)) ++ inode = d_inode(dentry); ++ if (inode && !is_bad_inode(inode)) { ++ iinfo = au_ii(inode); ++ hi = au_hinode(iinfo, bindex); ++ hn = !!au_hn(hi); ++ wh = hi->hi_whdentry; ++ } ++ do_pri_inode(bindex, inode, hn, wh); ++ return 0; ++} ++ ++void au_dpri_dentry(struct dentry *dentry) ++{ ++ struct au_dinfo *dinfo; ++ aufs_bindex_t bindex; ++ int err; ++ struct au_hdentry *hdp; ++ ++ err = do_pri_dentry(-1, dentry); ++ if (err || !au_test_aufs(dentry->d_sb)) ++ return; ++ ++ dinfo = au_di(dentry); ++ if (!dinfo) ++ return; ++ dpri("d-1: btop %d, bbot %d, bwh %d, bdiropq %d, gen %d, tmp %d\n", ++ dinfo->di_btop, dinfo->di_bbot, ++ dinfo->di_bwh, dinfo->di_bdiropq, au_digen(dentry), ++ dinfo->di_tmpfile); ++ if (dinfo->di_btop < 0) ++ return; ++ hdp = dinfo->di_hdentry; ++ for (bindex = dinfo->di_btop; bindex <= dinfo->di_bbot; bindex++) ++ do_pri_dentry(bindex, hdp[0 + bindex].hd_dentry); ++} ++ ++static int do_pri_file(aufs_bindex_t bindex, struct file *file) ++{ ++ char a[32]; ++ ++ if (!file || IS_ERR(file)) { ++ dpri("f%d: err %ld\n", bindex, PTR_ERR(file)); ++ return -1; ++ } ++ a[0] = 0; ++ if (bindex < 0 ++ && !IS_ERR_OR_NULL(file->f_path.dentry) ++ && au_test_aufs(file->f_path.dentry->d_sb) ++ && au_fi(file)) ++ snprintf(a, sizeof(a), ", gen %d, mmapped %d", ++ au_figen(file), atomic_read(&au_fi(file)->fi_mmapped)); ++ dpri("f%d: mode 0x%x, flags 0%o, cnt %ld, v %llu, pos %llu%s\n", ++ bindex, file->f_mode, file->f_flags, (long)file_count(file), ++ file->f_version, file->f_pos, a); ++ if (!IS_ERR_OR_NULL(file->f_path.dentry)) ++ do_pri_dentry(bindex, file->f_path.dentry); ++ return 0; ++} ++ ++void au_dpri_file(struct file *file) ++{ ++ struct au_finfo *finfo; ++ struct au_fidir *fidir; ++ struct au_hfile *hfile; ++ aufs_bindex_t bindex; ++ int err; ++ ++ err = do_pri_file(-1, file); ++ if (err ++ || IS_ERR_OR_NULL(file->f_path.dentry) ++ || !au_test_aufs(file->f_path.dentry->d_sb)) ++ return; ++ ++ finfo = au_fi(file); ++ if (!finfo) ++ return; ++ if (finfo->fi_btop < 0) ++ return; ++ fidir = finfo->fi_hdir; ++ if (!fidir) ++ do_pri_file(finfo->fi_btop, finfo->fi_htop.hf_file); ++ else ++ for (bindex = finfo->fi_btop; ++ bindex >= 0 && bindex <= fidir->fd_bbot; ++ bindex++) { ++ hfile = fidir->fd_hfile + bindex; ++ do_pri_file(bindex, hfile ? hfile->hf_file : NULL); ++ } ++} ++ ++static int do_pri_br(aufs_bindex_t bindex, struct au_branch *br) ++{ ++ struct vfsmount *mnt; ++ struct super_block *sb; ++ ++ if (!br || IS_ERR(br)) ++ goto out; ++ mnt = au_br_mnt(br); ++ if (!mnt || IS_ERR(mnt)) ++ goto out; ++ sb = mnt->mnt_sb; ++ if (!sb || IS_ERR(sb)) ++ goto out; ++ ++ dpri("s%d: {perm 0x%x, id %d, cnt %lld, wbr %p}, " ++ "%s, dev 0x%02x%02x, flags 0x%lx, cnt %d, active %d, " ++ "xino %d\n", ++ bindex, br->br_perm, br->br_id, au_br_count(br), ++ br->br_wbr, au_sbtype(sb), MAJOR(sb->s_dev), MINOR(sb->s_dev), ++ sb->s_flags, sb->s_count, ++ atomic_read(&sb->s_active), !!br->br_xino.xi_file); ++ return 0; ++ ++out: ++ dpri("s%d: err %ld\n", bindex, PTR_ERR(br)); ++ return -1; ++} ++ ++void au_dpri_sb(struct super_block *sb) ++{ ++ struct au_sbinfo *sbinfo; ++ aufs_bindex_t bindex; ++ int err; ++ /* to reuduce stack size */ ++ struct { ++ struct vfsmount mnt; ++ struct au_branch fake; ++ } *a; ++ ++ /* this function can be called from magic sysrq */ ++ a = kzalloc(sizeof(*a), GFP_ATOMIC); ++ if (unlikely(!a)) { ++ dpri("no memory\n"); ++ return; ++ } ++ ++ a->mnt.mnt_sb = sb; ++ a->fake.br_path.mnt = &a->mnt; ++ au_br_count_init(&a->fake); ++ err = do_pri_br(-1, &a->fake); ++ au_br_count_fin(&a->fake); ++ kfree(a); ++ dpri("dev 0x%x\n", sb->s_dev); ++ if (err || !au_test_aufs(sb)) ++ return; ++ ++ sbinfo = au_sbi(sb); ++ if (!sbinfo) ++ return; ++ dpri("nw %lld, gen %u, kobj %d\n", ++ percpu_counter_sum(&sbinfo->si_nowait.nw_len), ++ sbinfo->si_generation, ++ atomic_read(&sbinfo->si_kobj.kref.refcount)); ++ for (bindex = 0; bindex <= sbinfo->si_bbot; bindex++) ++ do_pri_br(bindex, sbinfo->si_branch[0 + bindex]); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++void __au_dbg_verify_dinode(struct dentry *dentry, const char *func, int line) ++{ ++ struct inode *h_inode, *inode = d_inode(dentry); ++ struct dentry *h_dentry; ++ aufs_bindex_t bindex, bbot, bi; ++ ++ if (!inode /* || au_di(dentry)->di_lsc == AuLsc_DI_TMP */) ++ return; ++ ++ bbot = au_dbbot(dentry); ++ bi = au_ibbot(inode); ++ if (bi < bbot) ++ bbot = bi; ++ bindex = au_dbtop(dentry); ++ bi = au_ibtop(inode); ++ if (bi > bindex) ++ bindex = bi; ++ ++ for (; bindex <= bbot; bindex++) { ++ h_dentry = au_h_dptr(dentry, bindex); ++ if (!h_dentry) ++ continue; ++ h_inode = au_h_iptr(inode, bindex); ++ if (unlikely(h_inode != d_inode(h_dentry))) { ++ au_debug_on(); ++ AuDbg("b%d, %s:%d\n", bindex, func, line); ++ AuDbgDentry(dentry); ++ AuDbgInode(inode); ++ au_debug_off(); ++ BUG(); ++ } ++ } ++} ++ ++void au_dbg_verify_gen(struct dentry *parent, unsigned int sigen) ++{ ++ int err, i, j; ++ struct au_dcsub_pages dpages; ++ struct au_dpage *dpage; ++ struct dentry **dentries; ++ ++ err = au_dpages_init(&dpages, GFP_NOFS); ++ AuDebugOn(err); ++ err = au_dcsub_pages_rev_aufs(&dpages, parent, /*do_include*/1); ++ AuDebugOn(err); ++ for (i = dpages.ndpage - 1; !err && i >= 0; i--) { ++ dpage = dpages.dpages + i; ++ dentries = dpage->dentries; ++ for (j = dpage->ndentry - 1; !err && j >= 0; j--) ++ AuDebugOn(au_digen_test(dentries[j], sigen)); ++ } ++ au_dpages_free(&dpages); ++} ++ ++void au_dbg_verify_kthread(void) ++{ ++ if (au_wkq_test()) { ++ au_dbg_blocked(); ++ /* ++ * It may be recursive, but udba=notify between two aufs mounts, ++ * where a single ro branch is shared, is not a problem. ++ */ ++ /* WARN_ON(1); */ ++ } ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++int __init au_debug_init(void) ++{ ++ aufs_bindex_t bindex; ++ struct au_vdir_destr destr; ++ ++ bindex = -1; ++ AuDebugOn(bindex >= 0); ++ ++ destr.len = -1; ++ AuDebugOn(destr.len < NAME_MAX); ++ ++#ifdef CONFIG_4KSTACKS ++ pr_warn("CONFIG_4KSTACKS is defined.\n"); ++#endif ++ ++ return 0; ++} +diff --git a/fs/aufs/debug.h b/fs/aufs/debug.h +new file mode 100644 +index 0000000..cd5fc3f +--- /dev/null ++++ b/fs/aufs/debug.h +@@ -0,0 +1,225 @@ ++/* ++ * Copyright (C) 2005-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * debug print functions ++ */ ++ ++#ifndef __AUFS_DEBUG_H__ ++#define __AUFS_DEBUG_H__ ++ ++#ifdef __KERNEL__ ++ ++#include ++#include ++#include ++#include ++ ++#ifdef CONFIG_AUFS_DEBUG ++#define AuDebugOn(a) BUG_ON(a) ++ ++/* module parameter */ ++extern atomic_t aufs_debug; ++static inline void au_debug_on(void) ++{ ++ atomic_inc(&aufs_debug); ++} ++static inline void au_debug_off(void) ++{ ++ atomic_dec_if_positive(&aufs_debug); ++} ++ ++static inline int au_debug_test(void) ++{ ++ return atomic_read(&aufs_debug) > 0; ++} ++#else ++#define AuDebugOn(a) do {} while (0) ++AuStubVoid(au_debug_on, void) ++AuStubVoid(au_debug_off, void) ++AuStubInt0(au_debug_test, void) ++#endif /* CONFIG_AUFS_DEBUG */ ++ ++#define param_check_atomic_t(name, p) __param_check(name, p, atomic_t) ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* debug print */ ++ ++#define AuDbg(fmt, ...) do { \ ++ if (au_debug_test()) \ ++ pr_debug("DEBUG: " fmt, ##__VA_ARGS__); \ ++} while (0) ++#define AuLabel(l) AuDbg(#l "\n") ++#define AuIOErr(fmt, ...) pr_err("I/O Error, " fmt, ##__VA_ARGS__) ++#define AuWarn1(fmt, ...) do { \ ++ static unsigned char _c; \ ++ if (!_c++) \ ++ pr_warn(fmt, ##__VA_ARGS__); \ ++} while (0) ++ ++#define AuErr1(fmt, ...) do { \ ++ static unsigned char _c; \ ++ if (!_c++) \ ++ pr_err(fmt, ##__VA_ARGS__); \ ++} while (0) ++ ++#define AuIOErr1(fmt, ...) do { \ ++ static unsigned char _c; \ ++ if (!_c++) \ ++ AuIOErr(fmt, ##__VA_ARGS__); \ ++} while (0) ++ ++#define AuUnsupportMsg "This operation is not supported." \ ++ " Please report this application to aufs-users ML." ++#define AuUnsupport(fmt, ...) do { \ ++ pr_err(AuUnsupportMsg "\n" fmt, ##__VA_ARGS__); \ ++ dump_stack(); \ ++} while (0) ++ ++#define AuTraceErr(e) do { \ ++ if (unlikely((e) < 0)) \ ++ AuDbg("err %d\n", (int)(e)); \ ++} while (0) ++ ++#define AuTraceErrPtr(p) do { \ ++ if (IS_ERR(p)) \ ++ AuDbg("err %ld\n", PTR_ERR(p)); \ ++} while (0) ++ ++/* dirty macros for debug print, use with "%.*s" and caution */ ++#define AuLNPair(qstr) (qstr)->len, (qstr)->name ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct dentry; ++#ifdef CONFIG_AUFS_DEBUG ++extern struct mutex au_dbg_mtx; ++extern char *au_plevel; ++struct au_nhash; ++void au_dpri_whlist(struct au_nhash *whlist); ++struct au_vdir; ++void au_dpri_vdir(struct au_vdir *vdir); ++struct inode; ++void au_dpri_inode(struct inode *inode); ++void au_dpri_dalias(struct inode *inode); ++void au_dpri_dentry(struct dentry *dentry); ++struct file; ++void au_dpri_file(struct file *filp); ++struct super_block; ++void au_dpri_sb(struct super_block *sb); ++ ++#define au_dbg_verify_dinode(d) __au_dbg_verify_dinode(d, __func__, __LINE__) ++void __au_dbg_verify_dinode(struct dentry *dentry, const char *func, int line); ++void au_dbg_verify_gen(struct dentry *parent, unsigned int sigen); ++void au_dbg_verify_kthread(void); ++ ++int __init au_debug_init(void); ++ ++#define AuDbgWhlist(w) do { \ ++ mutex_lock(&au_dbg_mtx); \ ++ AuDbg(#w "\n"); \ ++ au_dpri_whlist(w); \ ++ mutex_unlock(&au_dbg_mtx); \ ++} while (0) ++ ++#define AuDbgVdir(v) do { \ ++ mutex_lock(&au_dbg_mtx); \ ++ AuDbg(#v "\n"); \ ++ au_dpri_vdir(v); \ ++ mutex_unlock(&au_dbg_mtx); \ ++} while (0) ++ ++#define AuDbgInode(i) do { \ ++ mutex_lock(&au_dbg_mtx); \ ++ AuDbg(#i "\n"); \ ++ au_dpri_inode(i); \ ++ mutex_unlock(&au_dbg_mtx); \ ++} while (0) ++ ++#define AuDbgDAlias(i) do { \ ++ mutex_lock(&au_dbg_mtx); \ ++ AuDbg(#i "\n"); \ ++ au_dpri_dalias(i); \ ++ mutex_unlock(&au_dbg_mtx); \ ++} while (0) ++ ++#define AuDbgDentry(d) do { \ ++ mutex_lock(&au_dbg_mtx); \ ++ AuDbg(#d "\n"); \ ++ au_dpri_dentry(d); \ ++ mutex_unlock(&au_dbg_mtx); \ ++} while (0) ++ ++#define AuDbgFile(f) do { \ ++ mutex_lock(&au_dbg_mtx); \ ++ AuDbg(#f "\n"); \ ++ au_dpri_file(f); \ ++ mutex_unlock(&au_dbg_mtx); \ ++} while (0) ++ ++#define AuDbgSb(sb) do { \ ++ mutex_lock(&au_dbg_mtx); \ ++ AuDbg(#sb "\n"); \ ++ au_dpri_sb(sb); \ ++ mutex_unlock(&au_dbg_mtx); \ ++} while (0) ++ ++#define AuDbgSym(addr) do { \ ++ char sym[KSYM_SYMBOL_LEN]; \ ++ sprint_symbol(sym, (unsigned long)addr); \ ++ AuDbg("%s\n", sym); \ ++} while (0) ++#else ++AuStubVoid(au_dbg_verify_dinode, struct dentry *dentry) ++AuStubVoid(au_dbg_verify_gen, struct dentry *parent, unsigned int sigen) ++AuStubVoid(au_dbg_verify_kthread, void) ++AuStubInt0(__init au_debug_init, void) ++ ++#define AuDbgWhlist(w) do {} while (0) ++#define AuDbgVdir(v) do {} while (0) ++#define AuDbgInode(i) do {} while (0) ++#define AuDbgDAlias(i) do {} while (0) ++#define AuDbgDentry(d) do {} while (0) ++#define AuDbgFile(f) do {} while (0) ++#define AuDbgSb(sb) do {} while (0) ++#define AuDbgSym(addr) do {} while (0) ++#endif /* CONFIG_AUFS_DEBUG */ ++ ++/* ---------------------------------------------------------------------- */ ++ ++#ifdef CONFIG_AUFS_MAGIC_SYSRQ ++int __init au_sysrq_init(void); ++void au_sysrq_fin(void); ++ ++#ifdef CONFIG_HW_CONSOLE ++#define au_dbg_blocked() do { \ ++ WARN_ON(1); \ ++ handle_sysrq('w'); \ ++} while (0) ++#else ++AuStubVoid(au_dbg_blocked, void) ++#endif ++ ++#else ++AuStubInt0(__init au_sysrq_init, void) ++AuStubVoid(au_sysrq_fin, void) ++AuStubVoid(au_dbg_blocked, void) ++#endif /* CONFIG_AUFS_MAGIC_SYSRQ */ ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_DEBUG_H__ */ +diff --git a/fs/aufs/dentry.c b/fs/aufs/dentry.c +new file mode 100644 +index 0000000..6f6fe25 +--- /dev/null ++++ b/fs/aufs/dentry.c +@@ -0,0 +1,1136 @@ ++/* ++ * Copyright (C) 2005-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * lookup and dentry operations ++ */ ++ ++#include ++#include "aufs.h" ++ ++#define AuLkup_ALLOW_NEG 1 ++#define AuLkup_IGNORE_PERM (1 << 1) ++#define au_ftest_lkup(flags, name) ((flags) & AuLkup_##name) ++#define au_fset_lkup(flags, name) \ ++ do { (flags) |= AuLkup_##name; } while (0) ++#define au_fclr_lkup(flags, name) \ ++ do { (flags) &= ~AuLkup_##name; } while (0) ++ ++struct au_do_lookup_args { ++ unsigned int flags; ++ mode_t type; ++}; ++ ++/* ++ * returns positive/negative dentry, NULL or an error. ++ * NULL means whiteout-ed or not-found. ++ */ ++static struct dentry* ++au_do_lookup(struct dentry *h_parent, struct dentry *dentry, ++ aufs_bindex_t bindex, struct qstr *wh_name, ++ struct au_do_lookup_args *args) ++{ ++ struct dentry *h_dentry; ++ struct inode *h_inode; ++ struct au_branch *br; ++ int wh_found, opq; ++ unsigned char wh_able; ++ const unsigned char allow_neg = !!au_ftest_lkup(args->flags, ALLOW_NEG); ++ const unsigned char ignore_perm = !!au_ftest_lkup(args->flags, ++ IGNORE_PERM); ++ ++ wh_found = 0; ++ br = au_sbr(dentry->d_sb, bindex); ++ wh_able = !!au_br_whable(br->br_perm); ++ if (wh_able) ++ wh_found = au_wh_test(h_parent, wh_name, /*try_sio*/0); ++ h_dentry = ERR_PTR(wh_found); ++ if (!wh_found) ++ goto real_lookup; ++ if (unlikely(wh_found < 0)) ++ goto out; ++ ++ /* We found a whiteout */ ++ /* au_set_dbbot(dentry, bindex); */ ++ au_set_dbwh(dentry, bindex); ++ if (!allow_neg) ++ return NULL; /* success */ ++ ++real_lookup: ++ if (!ignore_perm) ++ h_dentry = vfsub_lkup_one(&dentry->d_name, h_parent); ++ else ++ h_dentry = au_sio_lkup_one(&dentry->d_name, h_parent); ++ if (IS_ERR(h_dentry)) { ++ if (PTR_ERR(h_dentry) == -ENAMETOOLONG ++ && !allow_neg) ++ h_dentry = NULL; ++ goto out; ++ } ++ ++ h_inode = d_inode(h_dentry); ++ if (d_is_negative(h_dentry)) { ++ if (!allow_neg) ++ goto out_neg; ++ } else if (wh_found ++ || (args->type && args->type != (h_inode->i_mode & S_IFMT))) ++ goto out_neg; ++ ++ if (au_dbbot(dentry) <= bindex) ++ au_set_dbbot(dentry, bindex); ++ if (au_dbtop(dentry) < 0 || bindex < au_dbtop(dentry)) ++ au_set_dbtop(dentry, bindex); ++ au_set_h_dptr(dentry, bindex, h_dentry); ++ ++ if (!d_is_dir(h_dentry) ++ || !wh_able ++ || (d_really_is_positive(dentry) && !d_is_dir(dentry))) ++ goto out; /* success */ ++ ++ inode_lock_nested(h_inode, AuLsc_I_CHILD); ++ opq = au_diropq_test(h_dentry); ++ inode_unlock(h_inode); ++ if (opq > 0) ++ au_set_dbdiropq(dentry, bindex); ++ else if (unlikely(opq < 0)) { ++ au_set_h_dptr(dentry, bindex, NULL); ++ h_dentry = ERR_PTR(opq); ++ } ++ goto out; ++ ++out_neg: ++ dput(h_dentry); ++ h_dentry = NULL; ++out: ++ return h_dentry; ++} ++ ++static int au_test_shwh(struct super_block *sb, const struct qstr *name) ++{ ++ if (unlikely(!au_opt_test(au_mntflags(sb), SHWH) ++ && !strncmp(name->name, AUFS_WH_PFX, AUFS_WH_PFX_LEN))) ++ return -EPERM; ++ return 0; ++} ++ ++/* ++ * returns the number of lower positive dentries, ++ * otherwise an error. ++ * can be called at unlinking with @type is zero. ++ */ ++int au_lkup_dentry(struct dentry *dentry, aufs_bindex_t btop, mode_t type) ++{ ++ int npositive, err; ++ aufs_bindex_t bindex, btail, bdiropq; ++ unsigned char isdir, dirperm1; ++ struct qstr whname; ++ struct au_do_lookup_args args = { ++ .flags = 0, ++ .type = type ++ }; ++ const struct qstr *name = &dentry->d_name; ++ struct dentry *parent; ++ struct super_block *sb; ++ ++ sb = dentry->d_sb; ++ err = au_test_shwh(sb, name); ++ if (unlikely(err)) ++ goto out; ++ ++ err = au_wh_name_alloc(&whname, name); ++ if (unlikely(err)) ++ goto out; ++ ++ isdir = !!d_is_dir(dentry); ++ if (!type) ++ au_fset_lkup(args.flags, ALLOW_NEG); ++ dirperm1 = !!au_opt_test(au_mntflags(sb), DIRPERM1); ++ ++ npositive = 0; ++ parent = dget_parent(dentry); ++ btail = au_dbtaildir(parent); ++ for (bindex = btop; bindex <= btail; bindex++) { ++ struct dentry *h_parent, *h_dentry; ++ struct inode *h_inode, *h_dir; ++ ++ h_dentry = au_h_dptr(dentry, bindex); ++ if (h_dentry) { ++ if (d_is_positive(h_dentry)) ++ npositive++; ++ if (type != S_IFDIR) ++ break; ++ continue; ++ } ++ h_parent = au_h_dptr(parent, bindex); ++ if (!h_parent || !d_is_dir(h_parent)) ++ continue; ++ ++ h_dir = d_inode(h_parent); ++ inode_lock_nested(h_dir, AuLsc_I_PARENT); ++ h_dentry = au_do_lookup(h_parent, dentry, bindex, &whname, ++ &args); ++ inode_unlock(h_dir); ++ err = PTR_ERR(h_dentry); ++ if (IS_ERR(h_dentry)) ++ goto out_parent; ++ if (h_dentry) ++ au_fclr_lkup(args.flags, ALLOW_NEG); ++ if (dirperm1) ++ au_fset_lkup(args.flags, IGNORE_PERM); ++ ++ if (au_dbwh(dentry) == bindex) ++ break; ++ if (!h_dentry) ++ continue; ++ if (d_is_negative(h_dentry)) ++ continue; ++ h_inode = d_inode(h_dentry); ++ npositive++; ++ if (!args.type) ++ args.type = h_inode->i_mode & S_IFMT; ++ if (args.type != S_IFDIR) ++ break; ++ else if (isdir) { ++ /* the type of lower may be different */ ++ bdiropq = au_dbdiropq(dentry); ++ if (bdiropq >= 0 && bdiropq <= bindex) ++ break; ++ } ++ } ++ ++ if (npositive) { ++ AuLabel(positive); ++ au_update_dbtop(dentry); ++ } ++ err = npositive; ++ if (unlikely(!au_opt_test(au_mntflags(sb), UDBA_NONE) ++ && au_dbtop(dentry) < 0)) { ++ err = -EIO; ++ AuIOErr("both of real entry and whiteout found, %pd, err %d\n", ++ dentry, err); ++ } ++ ++out_parent: ++ dput(parent); ++ kfree(whname.name); ++out: ++ return err; ++} ++ ++struct dentry *au_sio_lkup_one(struct qstr *name, struct dentry *parent) ++{ ++ struct dentry *dentry; ++ int wkq_err; ++ ++ if (!au_test_h_perm_sio(d_inode(parent), MAY_EXEC)) ++ dentry = vfsub_lkup_one(name, parent); ++ else { ++ struct vfsub_lkup_one_args args = { ++ .errp = &dentry, ++ .name = name, ++ .parent = parent ++ }; ++ ++ wkq_err = au_wkq_wait(vfsub_call_lkup_one, &args); ++ if (unlikely(wkq_err)) ++ dentry = ERR_PTR(wkq_err); ++ } ++ ++ return dentry; ++} ++ ++/* ++ * lookup @dentry on @bindex which should be negative. ++ */ ++int au_lkup_neg(struct dentry *dentry, aufs_bindex_t bindex, int wh) ++{ ++ int err; ++ struct dentry *parent, *h_parent, *h_dentry; ++ struct au_branch *br; ++ ++ parent = dget_parent(dentry); ++ h_parent = au_h_dptr(parent, bindex); ++ br = au_sbr(dentry->d_sb, bindex); ++ if (wh) ++ h_dentry = au_whtmp_lkup(h_parent, br, &dentry->d_name); ++ else ++ h_dentry = au_sio_lkup_one(&dentry->d_name, h_parent); ++ err = PTR_ERR(h_dentry); ++ if (IS_ERR(h_dentry)) ++ goto out; ++ if (unlikely(d_is_positive(h_dentry))) { ++ err = -EIO; ++ AuIOErr("%pd should be negative on b%d.\n", h_dentry, bindex); ++ dput(h_dentry); ++ goto out; ++ } ++ ++ err = 0; ++ if (bindex < au_dbtop(dentry)) ++ au_set_dbtop(dentry, bindex); ++ if (au_dbbot(dentry) < bindex) ++ au_set_dbbot(dentry, bindex); ++ au_set_h_dptr(dentry, bindex, h_dentry); ++ ++out: ++ dput(parent); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* subset of struct inode */ ++struct au_iattr { ++ unsigned long i_ino; ++ /* unsigned int i_nlink; */ ++ kuid_t i_uid; ++ kgid_t i_gid; ++ u64 i_version; ++/* ++ loff_t i_size; ++ blkcnt_t i_blocks; ++*/ ++ umode_t i_mode; ++}; ++ ++static void au_iattr_save(struct au_iattr *ia, struct inode *h_inode) ++{ ++ ia->i_ino = h_inode->i_ino; ++ /* ia->i_nlink = h_inode->i_nlink; */ ++ ia->i_uid = h_inode->i_uid; ++ ia->i_gid = h_inode->i_gid; ++ ia->i_version = h_inode->i_version; ++/* ++ ia->i_size = h_inode->i_size; ++ ia->i_blocks = h_inode->i_blocks; ++*/ ++ ia->i_mode = (h_inode->i_mode & S_IFMT); ++} ++ ++static int au_iattr_test(struct au_iattr *ia, struct inode *h_inode) ++{ ++ return ia->i_ino != h_inode->i_ino ++ /* || ia->i_nlink != h_inode->i_nlink */ ++ || !uid_eq(ia->i_uid, h_inode->i_uid) ++ || !gid_eq(ia->i_gid, h_inode->i_gid) ++ || ia->i_version != h_inode->i_version ++/* ++ || ia->i_size != h_inode->i_size ++ || ia->i_blocks != h_inode->i_blocks ++*/ ++ || ia->i_mode != (h_inode->i_mode & S_IFMT); ++} ++ ++static int au_h_verify_dentry(struct dentry *h_dentry, struct dentry *h_parent, ++ struct au_branch *br) ++{ ++ int err; ++ struct au_iattr ia; ++ struct inode *h_inode; ++ struct dentry *h_d; ++ struct super_block *h_sb; ++ ++ err = 0; ++ memset(&ia, -1, sizeof(ia)); ++ h_sb = h_dentry->d_sb; ++ h_inode = NULL; ++ if (d_is_positive(h_dentry)) { ++ h_inode = d_inode(h_dentry); ++ au_iattr_save(&ia, h_inode); ++ } else if (au_test_nfs(h_sb) || au_test_fuse(h_sb)) ++ /* nfs d_revalidate may return 0 for negative dentry */ ++ /* fuse d_revalidate always return 0 for negative dentry */ ++ goto out; ++ ++ /* main purpose is namei.c:cached_lookup() and d_revalidate */ ++ h_d = vfsub_lkup_one(&h_dentry->d_name, h_parent); ++ err = PTR_ERR(h_d); ++ if (IS_ERR(h_d)) ++ goto out; ++ ++ err = 0; ++ if (unlikely(h_d != h_dentry ++ || d_inode(h_d) != h_inode ++ || (h_inode && au_iattr_test(&ia, h_inode)))) ++ err = au_busy_or_stale(); ++ dput(h_d); ++ ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++int au_h_verify(struct dentry *h_dentry, unsigned int udba, struct inode *h_dir, ++ struct dentry *h_parent, struct au_branch *br) ++{ ++ int err; ++ ++ err = 0; ++ if (udba == AuOpt_UDBA_REVAL ++ && !au_test_fs_remote(h_dentry->d_sb)) { ++ IMustLock(h_dir); ++ err = (d_inode(h_dentry->d_parent) != h_dir); ++ } else if (udba != AuOpt_UDBA_NONE) ++ err = au_h_verify_dentry(h_dentry, h_parent, br); ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int au_do_refresh_hdentry(struct dentry *dentry, struct dentry *parent) ++{ ++ int err; ++ aufs_bindex_t new_bindex, bindex, bbot, bwh, bdiropq; ++ struct au_hdentry tmp, *p, *q; ++ struct au_dinfo *dinfo; ++ struct super_block *sb; ++ ++ DiMustWriteLock(dentry); ++ ++ sb = dentry->d_sb; ++ dinfo = au_di(dentry); ++ bbot = dinfo->di_bbot; ++ bwh = dinfo->di_bwh; ++ bdiropq = dinfo->di_bdiropq; ++ p = dinfo->di_hdentry + dinfo->di_btop; ++ for (bindex = dinfo->di_btop; bindex <= bbot; bindex++, p++) { ++ if (!p->hd_dentry) ++ continue; ++ ++ new_bindex = au_br_index(sb, p->hd_id); ++ if (new_bindex == bindex) ++ continue; ++ ++ if (dinfo->di_bwh == bindex) ++ bwh = new_bindex; ++ if (dinfo->di_bdiropq == bindex) ++ bdiropq = new_bindex; ++ if (new_bindex < 0) { ++ au_hdput(p); ++ p->hd_dentry = NULL; ++ continue; ++ } ++ ++ /* swap two lower dentries, and loop again */ ++ q = dinfo->di_hdentry + new_bindex; ++ tmp = *q; ++ *q = *p; ++ *p = tmp; ++ if (tmp.hd_dentry) { ++ bindex--; ++ p--; ++ } ++ } ++ ++ dinfo->di_bwh = -1; ++ if (bwh >= 0 && bwh <= au_sbbot(sb) && au_sbr_whable(sb, bwh)) ++ dinfo->di_bwh = bwh; ++ ++ dinfo->di_bdiropq = -1; ++ if (bdiropq >= 0 ++ && bdiropq <= au_sbbot(sb) ++ && au_sbr_whable(sb, bdiropq)) ++ dinfo->di_bdiropq = bdiropq; ++ ++ err = -EIO; ++ dinfo->di_btop = -1; ++ dinfo->di_bbot = -1; ++ bbot = au_dbbot(parent); ++ p = dinfo->di_hdentry; ++ for (bindex = 0; bindex <= bbot; bindex++, p++) ++ if (p->hd_dentry) { ++ dinfo->di_btop = bindex; ++ break; ++ } ++ ++ if (dinfo->di_btop >= 0) { ++ p = dinfo->di_hdentry + bbot; ++ for (bindex = bbot; bindex >= 0; bindex--, p--) ++ if (p->hd_dentry) { ++ dinfo->di_bbot = bindex; ++ err = 0; ++ break; ++ } ++ } ++ ++ return err; ++} ++ ++static void au_do_hide(struct dentry *dentry) ++{ ++ struct inode *inode; ++ ++ if (d_really_is_positive(dentry)) { ++ inode = d_inode(dentry); ++ if (!d_is_dir(dentry)) { ++ if (inode->i_nlink && !d_unhashed(dentry)) ++ drop_nlink(inode); ++ } else { ++ clear_nlink(inode); ++ /* stop next lookup */ ++ inode->i_flags |= S_DEAD; ++ } ++ smp_mb(); /* necessary? */ ++ } ++ d_drop(dentry); ++} ++ ++static int au_hide_children(struct dentry *parent) ++{ ++ int err, i, j, ndentry; ++ struct au_dcsub_pages dpages; ++ struct au_dpage *dpage; ++ struct dentry *dentry; ++ ++ err = au_dpages_init(&dpages, GFP_NOFS); ++ if (unlikely(err)) ++ goto out; ++ err = au_dcsub_pages(&dpages, parent, NULL, NULL); ++ if (unlikely(err)) ++ goto out_dpages; ++ ++ /* in reverse order */ ++ for (i = dpages.ndpage - 1; i >= 0; i--) { ++ dpage = dpages.dpages + i; ++ ndentry = dpage->ndentry; ++ for (j = ndentry - 1; j >= 0; j--) { ++ dentry = dpage->dentries[j]; ++ if (dentry != parent) ++ au_do_hide(dentry); ++ } ++ } ++ ++out_dpages: ++ au_dpages_free(&dpages); ++out: ++ return err; ++} ++ ++static void au_hide(struct dentry *dentry) ++{ ++ int err; ++ ++ AuDbgDentry(dentry); ++ if (d_is_dir(dentry)) { ++ /* shrink_dcache_parent(dentry); */ ++ err = au_hide_children(dentry); ++ if (unlikely(err)) ++ AuIOErr("%pd, failed hiding children, ignored %d\n", ++ dentry, err); ++ } ++ au_do_hide(dentry); ++} ++ ++/* ++ * By adding a dirty branch, a cached dentry may be affected in various ways. ++ * ++ * a dirty branch is added ++ * - on the top of layers ++ * - in the middle of layers ++ * - to the bottom of layers ++ * ++ * on the added branch there exists ++ * - a whiteout ++ * - a diropq ++ * - a same named entry ++ * + exist ++ * * negative --> positive ++ * * positive --> positive ++ * - type is unchanged ++ * - type is changed ++ * + doesn't exist ++ * * negative --> negative ++ * * positive --> negative (rejected by au_br_del() for non-dir case) ++ * - none ++ */ ++static int au_refresh_by_dinfo(struct dentry *dentry, struct au_dinfo *dinfo, ++ struct au_dinfo *tmp) ++{ ++ int err; ++ aufs_bindex_t bindex, bbot; ++ struct { ++ struct dentry *dentry; ++ struct inode *inode; ++ mode_t mode; ++ } orig_h, tmp_h = { ++ .dentry = NULL ++ }; ++ struct au_hdentry *hd; ++ struct inode *inode, *h_inode; ++ struct dentry *h_dentry; ++ ++ err = 0; ++ AuDebugOn(dinfo->di_btop < 0); ++ orig_h.mode = 0; ++ orig_h.dentry = dinfo->di_hdentry[dinfo->di_btop].hd_dentry; ++ orig_h.inode = NULL; ++ if (d_is_positive(orig_h.dentry)) { ++ orig_h.inode = d_inode(orig_h.dentry); ++ orig_h.mode = orig_h.inode->i_mode & S_IFMT; ++ } ++ if (tmp->di_btop >= 0) { ++ tmp_h.dentry = tmp->di_hdentry[tmp->di_btop].hd_dentry; ++ if (d_is_positive(tmp_h.dentry)) { ++ tmp_h.inode = d_inode(tmp_h.dentry); ++ tmp_h.mode = tmp_h.inode->i_mode & S_IFMT; ++ } ++ } ++ ++ inode = NULL; ++ if (d_really_is_positive(dentry)) ++ inode = d_inode(dentry); ++ if (!orig_h.inode) { ++ AuDbg("nagative originally\n"); ++ if (inode) { ++ au_hide(dentry); ++ goto out; ++ } ++ AuDebugOn(inode); ++ AuDebugOn(dinfo->di_btop != dinfo->di_bbot); ++ AuDebugOn(dinfo->di_bdiropq != -1); ++ ++ if (!tmp_h.inode) { ++ AuDbg("negative --> negative\n"); ++ /* should have only one negative lower */ ++ if (tmp->di_btop >= 0 ++ && tmp->di_btop < dinfo->di_btop) { ++ AuDebugOn(tmp->di_btop != tmp->di_bbot); ++ AuDebugOn(dinfo->di_btop != dinfo->di_bbot); ++ au_set_h_dptr(dentry, dinfo->di_btop, NULL); ++ au_di_cp(dinfo, tmp); ++ hd = tmp->di_hdentry + tmp->di_btop; ++ au_set_h_dptr(dentry, tmp->di_btop, ++ dget(hd->hd_dentry)); ++ } ++ au_dbg_verify_dinode(dentry); ++ } else { ++ AuDbg("negative --> positive\n"); ++ /* ++ * similar to the behaviour of creating with bypassing ++ * aufs. ++ * unhash it in order to force an error in the ++ * succeeding create operation. ++ * we should not set S_DEAD here. ++ */ ++ d_drop(dentry); ++ /* au_di_swap(tmp, dinfo); */ ++ au_dbg_verify_dinode(dentry); ++ } ++ } else { ++ AuDbg("positive originally\n"); ++ /* inode may be NULL */ ++ AuDebugOn(inode && (inode->i_mode & S_IFMT) != orig_h.mode); ++ if (!tmp_h.inode) { ++ AuDbg("positive --> negative\n"); ++ /* or bypassing aufs */ ++ au_hide(dentry); ++ if (tmp->di_bwh >= 0 && tmp->di_bwh <= dinfo->di_btop) ++ dinfo->di_bwh = tmp->di_bwh; ++ if (inode) ++ err = au_refresh_hinode_self(inode); ++ au_dbg_verify_dinode(dentry); ++ } else if (orig_h.mode == tmp_h.mode) { ++ AuDbg("positive --> positive, same type\n"); ++ if (!S_ISDIR(orig_h.mode) ++ && dinfo->di_btop > tmp->di_btop) { ++ /* ++ * similar to the behaviour of removing and ++ * creating. ++ */ ++ au_hide(dentry); ++ if (inode) ++ err = au_refresh_hinode_self(inode); ++ au_dbg_verify_dinode(dentry); ++ } else { ++ /* fill empty slots */ ++ if (dinfo->di_btop > tmp->di_btop) ++ dinfo->di_btop = tmp->di_btop; ++ if (dinfo->di_bbot < tmp->di_bbot) ++ dinfo->di_bbot = tmp->di_bbot; ++ dinfo->di_bwh = tmp->di_bwh; ++ dinfo->di_bdiropq = tmp->di_bdiropq; ++ hd = tmp->di_hdentry; ++ bbot = dinfo->di_bbot; ++ for (bindex = tmp->di_btop; bindex <= bbot; ++ bindex++) { ++ if (au_h_dptr(dentry, bindex)) ++ continue; ++ h_dentry = hd[bindex].hd_dentry; ++ if (!h_dentry) ++ continue; ++ AuDebugOn(d_is_negative(h_dentry)); ++ h_inode = d_inode(h_dentry); ++ AuDebugOn(orig_h.mode ++ != (h_inode->i_mode ++ & S_IFMT)); ++ au_set_h_dptr(dentry, bindex, ++ dget(h_dentry)); ++ } ++ err = au_refresh_hinode(inode, dentry); ++ au_dbg_verify_dinode(dentry); ++ } ++ } else { ++ AuDbg("positive --> positive, different type\n"); ++ /* similar to the behaviour of removing and creating */ ++ au_hide(dentry); ++ if (inode) ++ err = au_refresh_hinode_self(inode); ++ au_dbg_verify_dinode(dentry); ++ } ++ } ++ ++out: ++ return err; ++} ++ ++void au_refresh_dop(struct dentry *dentry, int force_reval) ++{ ++ const struct dentry_operations *dop ++ = force_reval ? &aufs_dop : dentry->d_sb->s_d_op; ++ static const unsigned int mask ++ = DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE; ++ ++ BUILD_BUG_ON(sizeof(mask) != sizeof(dentry->d_flags)); ++ ++ if (dentry->d_op == dop) ++ return; ++ ++ AuDbg("%pd\n", dentry); ++ spin_lock(&dentry->d_lock); ++ if (dop == &aufs_dop) ++ dentry->d_flags |= mask; ++ else ++ dentry->d_flags &= ~mask; ++ dentry->d_op = dop; ++ spin_unlock(&dentry->d_lock); ++} ++ ++int au_refresh_dentry(struct dentry *dentry, struct dentry *parent) ++{ ++ int err, ebrange; ++ unsigned int sigen; ++ struct au_dinfo *dinfo, *tmp; ++ struct super_block *sb; ++ struct inode *inode; ++ ++ DiMustWriteLock(dentry); ++ AuDebugOn(IS_ROOT(dentry)); ++ AuDebugOn(d_really_is_negative(parent)); ++ ++ sb = dentry->d_sb; ++ sigen = au_sigen(sb); ++ err = au_digen_test(parent, sigen); ++ if (unlikely(err)) ++ goto out; ++ ++ dinfo = au_di(dentry); ++ err = au_di_realloc(dinfo, au_sbbot(sb) + 1); ++ if (unlikely(err)) ++ goto out; ++ ebrange = au_dbrange_test(dentry); ++ if (!ebrange) ++ ebrange = au_do_refresh_hdentry(dentry, parent); ++ ++ if (d_unhashed(dentry) || ebrange /* || dinfo->di_tmpfile */) { ++ AuDebugOn(au_dbtop(dentry) < 0 && au_dbbot(dentry) >= 0); ++ if (d_really_is_positive(dentry)) { ++ inode = d_inode(dentry); ++ err = au_refresh_hinode_self(inode); ++ } ++ au_dbg_verify_dinode(dentry); ++ if (!err) ++ goto out_dgen; /* success */ ++ goto out; ++ } ++ ++ /* temporary dinfo */ ++ AuDbgDentry(dentry); ++ err = -ENOMEM; ++ tmp = au_di_alloc(sb, AuLsc_DI_TMP); ++ if (unlikely(!tmp)) ++ goto out; ++ au_di_swap(tmp, dinfo); ++ /* returns the number of positive dentries */ ++ /* ++ * if current working dir is removed, it returns an error. ++ * but the dentry is legal. ++ */ ++ err = au_lkup_dentry(dentry, /*btop*/0, /*type*/0); ++ AuDbgDentry(dentry); ++ au_di_swap(tmp, dinfo); ++ if (err == -ENOENT) ++ err = 0; ++ if (err >= 0) { ++ /* compare/refresh by dinfo */ ++ AuDbgDentry(dentry); ++ err = au_refresh_by_dinfo(dentry, dinfo, tmp); ++ au_dbg_verify_dinode(dentry); ++ AuTraceErr(err); ++ } ++ au_rw_write_unlock(&tmp->di_rwsem); ++ au_di_free(tmp); ++ if (unlikely(err)) ++ goto out; ++ ++out_dgen: ++ au_update_digen(dentry); ++out: ++ if (unlikely(err && !(dentry->d_flags & DCACHE_NFSFS_RENAMED))) { ++ AuIOErr("failed refreshing %pd, %d\n", dentry, err); ++ AuDbgDentry(dentry); ++ } ++ AuTraceErr(err); ++ return err; ++} ++ ++static int au_do_h_d_reval(struct dentry *h_dentry, unsigned int flags, ++ struct dentry *dentry, aufs_bindex_t bindex) ++{ ++ int err, valid; ++ ++ err = 0; ++ if (!(h_dentry->d_flags & DCACHE_OP_REVALIDATE)) ++ goto out; ++ ++ AuDbg("b%d\n", bindex); ++ /* ++ * gave up supporting LOOKUP_CREATE/OPEN for lower fs, ++ * due to whiteout and branch permission. ++ */ ++ flags &= ~(/*LOOKUP_PARENT |*/ LOOKUP_OPEN | LOOKUP_CREATE ++ | LOOKUP_FOLLOW | LOOKUP_EXCL); ++ /* it may return tri-state */ ++ valid = h_dentry->d_op->d_revalidate(h_dentry, flags); ++ ++ if (unlikely(valid < 0)) ++ err = valid; ++ else if (!valid) ++ err = -EINVAL; ++ ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++/* todo: remove this */ ++static int h_d_revalidate(struct dentry *dentry, struct inode *inode, ++ unsigned int flags, int do_udba) ++{ ++ int err; ++ umode_t mode, h_mode; ++ aufs_bindex_t bindex, btail, btop, ibs, ibe; ++ unsigned char plus, unhashed, is_root, h_plus, h_nfs, tmpfile; ++ struct inode *h_inode, *h_cached_inode; ++ struct dentry *h_dentry; ++ struct qstr *name, *h_name; ++ ++ err = 0; ++ plus = 0; ++ mode = 0; ++ ibs = -1; ++ ibe = -1; ++ unhashed = !!d_unhashed(dentry); ++ is_root = !!IS_ROOT(dentry); ++ name = &dentry->d_name; ++ tmpfile = au_di(dentry)->di_tmpfile; ++ ++ /* ++ * Theoretically, REVAL test should be unnecessary in case of ++ * {FS,I}NOTIFY. ++ * But {fs,i}notify doesn't fire some necessary events, ++ * IN_ATTRIB for atime/nlink/pageio ++ * Let's do REVAL test too. ++ */ ++ if (do_udba && inode) { ++ mode = (inode->i_mode & S_IFMT); ++ plus = (inode->i_nlink > 0); ++ ibs = au_ibtop(inode); ++ ibe = au_ibbot(inode); ++ } ++ ++ btop = au_dbtop(dentry); ++ btail = btop; ++ if (inode && S_ISDIR(inode->i_mode)) ++ btail = au_dbtaildir(dentry); ++ for (bindex = btop; bindex <= btail; bindex++) { ++ h_dentry = au_h_dptr(dentry, bindex); ++ if (!h_dentry) ++ continue; ++ ++ AuDbg("b%d, %pd\n", bindex, h_dentry); ++ h_nfs = !!au_test_nfs(h_dentry->d_sb); ++ spin_lock(&h_dentry->d_lock); ++ h_name = &h_dentry->d_name; ++ if (unlikely(do_udba ++ && !is_root ++ && ((!h_nfs ++ && (unhashed != !!d_unhashed(h_dentry) ++ || (!tmpfile ++ && !au_qstreq(name, h_name)) ++ )) ++ || (h_nfs ++ && !(flags & LOOKUP_OPEN) ++ && (h_dentry->d_flags ++ & DCACHE_NFSFS_RENAMED))) ++ )) { ++ int h_unhashed; ++ ++ h_unhashed = d_unhashed(h_dentry); ++ spin_unlock(&h_dentry->d_lock); ++ AuDbg("unhash 0x%x 0x%x, %pd %pd\n", ++ unhashed, h_unhashed, dentry, h_dentry); ++ goto err; ++ } ++ spin_unlock(&h_dentry->d_lock); ++ ++ err = au_do_h_d_reval(h_dentry, flags, dentry, bindex); ++ if (unlikely(err)) ++ /* do not goto err, to keep the errno */ ++ break; ++ ++ /* todo: plink too? */ ++ if (!do_udba) ++ continue; ++ ++ /* UDBA tests */ ++ if (unlikely(!!inode != d_is_positive(h_dentry))) ++ goto err; ++ ++ h_inode = NULL; ++ if (d_is_positive(h_dentry)) ++ h_inode = d_inode(h_dentry); ++ h_plus = plus; ++ h_mode = mode; ++ h_cached_inode = h_inode; ++ if (h_inode) { ++ h_mode = (h_inode->i_mode & S_IFMT); ++ h_plus = (h_inode->i_nlink > 0); ++ } ++ if (inode && ibs <= bindex && bindex <= ibe) ++ h_cached_inode = au_h_iptr(inode, bindex); ++ ++ if (!h_nfs) { ++ if (unlikely(plus != h_plus && !tmpfile)) ++ goto err; ++ } else { ++ if (unlikely(!(h_dentry->d_flags & DCACHE_NFSFS_RENAMED) ++ && !is_root ++ && !IS_ROOT(h_dentry) ++ && unhashed != d_unhashed(h_dentry))) ++ goto err; ++ } ++ if (unlikely(mode != h_mode ++ || h_cached_inode != h_inode)) ++ goto err; ++ continue; ++ ++err: ++ err = -EINVAL; ++ break; ++ } ++ ++ AuTraceErr(err); ++ return err; ++} ++ ++/* todo: consolidate with do_refresh() and au_reval_for_attr() */ ++static int simple_reval_dpath(struct dentry *dentry, unsigned int sigen) ++{ ++ int err; ++ struct dentry *parent; ++ ++ if (!au_digen_test(dentry, sigen)) ++ return 0; ++ ++ parent = dget_parent(dentry); ++ di_read_lock_parent(parent, AuLock_IR); ++ AuDebugOn(au_digen_test(parent, sigen)); ++ au_dbg_verify_gen(parent, sigen); ++ err = au_refresh_dentry(dentry, parent); ++ di_read_unlock(parent, AuLock_IR); ++ dput(parent); ++ AuTraceErr(err); ++ return err; ++} ++ ++int au_reval_dpath(struct dentry *dentry, unsigned int sigen) ++{ ++ int err; ++ struct dentry *d, *parent; ++ ++ if (!au_ftest_si(au_sbi(dentry->d_sb), FAILED_REFRESH_DIR)) ++ return simple_reval_dpath(dentry, sigen); ++ ++ /* slow loop, keep it simple and stupid */ ++ /* cf: au_cpup_dirs() */ ++ err = 0; ++ parent = NULL; ++ while (au_digen_test(dentry, sigen)) { ++ d = dentry; ++ while (1) { ++ dput(parent); ++ parent = dget_parent(d); ++ if (!au_digen_test(parent, sigen)) ++ break; ++ d = parent; ++ } ++ ++ if (d != dentry) ++ di_write_lock_child2(d); ++ ++ /* someone might update our dentry while we were sleeping */ ++ if (au_digen_test(d, sigen)) { ++ /* ++ * todo: consolidate with simple_reval_dpath(), ++ * do_refresh() and au_reval_for_attr(). ++ */ ++ di_read_lock_parent(parent, AuLock_IR); ++ err = au_refresh_dentry(d, parent); ++ di_read_unlock(parent, AuLock_IR); ++ } ++ ++ if (d != dentry) ++ di_write_unlock(d); ++ dput(parent); ++ if (unlikely(err)) ++ break; ++ } ++ ++ return err; ++} ++ ++/* ++ * if valid returns 1, otherwise 0. ++ */ ++static int aufs_d_revalidate(struct dentry *dentry, unsigned int flags) ++{ ++ int valid, err; ++ unsigned int sigen; ++ unsigned char do_udba; ++ struct super_block *sb; ++ struct inode *inode; ++ ++ /* todo: support rcu-walk? */ ++ if (flags & LOOKUP_RCU) ++ return -ECHILD; ++ ++ valid = 0; ++ if (unlikely(!au_di(dentry))) ++ goto out; ++ ++ valid = 1; ++ sb = dentry->d_sb; ++ /* ++ * todo: very ugly ++ * i_mutex of parent dir may be held, ++ * but we should not return 'invalid' due to busy. ++ */ ++ err = aufs_read_lock(dentry, AuLock_FLUSH | AuLock_DW | AuLock_NOPLM); ++ if (unlikely(err)) { ++ valid = err; ++ AuTraceErr(err); ++ goto out; ++ } ++ inode = NULL; ++ if (d_really_is_positive(dentry)) ++ inode = d_inode(dentry); ++ if (unlikely(inode && is_bad_inode(inode))) { ++ err = -EINVAL; ++ AuTraceErr(err); ++ goto out_dgrade; ++ } ++ if (unlikely(au_dbrange_test(dentry))) { ++ err = -EINVAL; ++ AuTraceErr(err); ++ goto out_dgrade; ++ } ++ ++ sigen = au_sigen(sb); ++ if (au_digen_test(dentry, sigen)) { ++ AuDebugOn(IS_ROOT(dentry)); ++ err = au_reval_dpath(dentry, sigen); ++ if (unlikely(err)) { ++ AuTraceErr(err); ++ goto out_dgrade; ++ } ++ } ++ di_downgrade_lock(dentry, AuLock_IR); ++ ++ err = -EINVAL; ++ if (!(flags & (LOOKUP_OPEN | LOOKUP_EMPTY)) ++ && inode ++ && !(inode->i_state && I_LINKABLE) ++ && (IS_DEADDIR(inode) || !inode->i_nlink)) { ++ AuTraceErr(err); ++ goto out_inval; ++ } ++ ++ do_udba = !au_opt_test(au_mntflags(sb), UDBA_NONE); ++ if (do_udba && inode) { ++ aufs_bindex_t btop = au_ibtop(inode); ++ struct inode *h_inode; ++ ++ if (btop >= 0) { ++ h_inode = au_h_iptr(inode, btop); ++ if (h_inode && au_test_higen(inode, h_inode)) { ++ AuTraceErr(err); ++ goto out_inval; ++ } ++ } ++ } ++ ++ err = h_d_revalidate(dentry, inode, flags, do_udba); ++ if (unlikely(!err && do_udba && au_dbtop(dentry) < 0)) { ++ err = -EIO; ++ AuDbg("both of real entry and whiteout found, %p, err %d\n", ++ dentry, err); ++ } ++ goto out_inval; ++ ++out_dgrade: ++ di_downgrade_lock(dentry, AuLock_IR); ++out_inval: ++ aufs_read_unlock(dentry, AuLock_IR); ++ AuTraceErr(err); ++ valid = !err; ++out: ++ if (!valid) { ++ AuDbg("%pd invalid, %d\n", dentry, valid); ++ d_drop(dentry); ++ } ++ return valid; ++} ++ ++static void aufs_d_release(struct dentry *dentry) ++{ ++ if (au_di(dentry)) { ++ au_di_fin(dentry); ++ au_hn_di_reinit(dentry); ++ } ++} ++ ++const struct dentry_operations aufs_dop = { ++ .d_revalidate = aufs_d_revalidate, ++ .d_weak_revalidate = aufs_d_revalidate, ++ .d_release = aufs_d_release ++}; ++ ++/* aufs_dop without d_revalidate */ ++const struct dentry_operations aufs_dop_noreval = { ++ .d_release = aufs_d_release ++}; +diff --git a/fs/aufs/dentry.h b/fs/aufs/dentry.h +new file mode 100644 +index 0000000..238909f +--- /dev/null ++++ b/fs/aufs/dentry.h +@@ -0,0 +1,234 @@ ++/* ++ * Copyright (C) 2005-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * lookup and dentry operations ++ */ ++ ++#ifndef __AUFS_DENTRY_H__ ++#define __AUFS_DENTRY_H__ ++ ++#ifdef __KERNEL__ ++ ++#include ++#include "rwsem.h" ++ ++struct au_hdentry { ++ struct dentry *hd_dentry; ++ aufs_bindex_t hd_id; ++}; ++ ++struct au_dinfo { ++ atomic_t di_generation; ++ ++ struct au_rwsem di_rwsem; ++ aufs_bindex_t di_btop, di_bbot, di_bwh, di_bdiropq; ++ unsigned char di_tmpfile; /* to allow the different name */ ++ struct au_hdentry *di_hdentry; ++} ____cacheline_aligned_in_smp; ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* dentry.c */ ++extern const struct dentry_operations aufs_dop, aufs_dop_noreval; ++struct au_branch; ++struct dentry *au_sio_lkup_one(struct qstr *name, struct dentry *parent); ++int au_h_verify(struct dentry *h_dentry, unsigned int udba, struct inode *h_dir, ++ struct dentry *h_parent, struct au_branch *br); ++ ++int au_lkup_dentry(struct dentry *dentry, aufs_bindex_t btop, mode_t type); ++int au_lkup_neg(struct dentry *dentry, aufs_bindex_t bindex, int wh); ++int au_refresh_dentry(struct dentry *dentry, struct dentry *parent); ++int au_reval_dpath(struct dentry *dentry, unsigned int sigen); ++void au_refresh_dop(struct dentry *dentry, int force_reval); ++ ++/* dinfo.c */ ++void au_di_init_once(void *_di); ++struct au_dinfo *au_di_alloc(struct super_block *sb, unsigned int lsc); ++void au_di_free(struct au_dinfo *dinfo); ++void au_di_swap(struct au_dinfo *a, struct au_dinfo *b); ++void au_di_cp(struct au_dinfo *dst, struct au_dinfo *src); ++int au_di_init(struct dentry *dentry); ++void au_di_fin(struct dentry *dentry); ++int au_di_realloc(struct au_dinfo *dinfo, int nbr); ++ ++void di_read_lock(struct dentry *d, int flags, unsigned int lsc); ++void di_read_unlock(struct dentry *d, int flags); ++void di_downgrade_lock(struct dentry *d, int flags); ++void di_write_lock(struct dentry *d, unsigned int lsc); ++void di_write_unlock(struct dentry *d); ++void di_write_lock2_child(struct dentry *d1, struct dentry *d2, int isdir); ++void di_write_lock2_parent(struct dentry *d1, struct dentry *d2, int isdir); ++void di_write_unlock2(struct dentry *d1, struct dentry *d2); ++ ++struct dentry *au_h_dptr(struct dentry *dentry, aufs_bindex_t bindex); ++struct dentry *au_h_d_alias(struct dentry *dentry, aufs_bindex_t bindex); ++aufs_bindex_t au_dbtail(struct dentry *dentry); ++aufs_bindex_t au_dbtaildir(struct dentry *dentry); ++ ++void au_set_h_dptr(struct dentry *dentry, aufs_bindex_t bindex, ++ struct dentry *h_dentry); ++int au_digen_test(struct dentry *dentry, unsigned int sigen); ++int au_dbrange_test(struct dentry *dentry); ++void au_update_digen(struct dentry *dentry); ++void au_update_dbrange(struct dentry *dentry, int do_put_zero); ++void au_update_dbtop(struct dentry *dentry); ++void au_update_dbbot(struct dentry *dentry); ++int au_find_dbindex(struct dentry *dentry, struct dentry *h_dentry); ++ ++/* ---------------------------------------------------------------------- */ ++ ++static inline struct au_dinfo *au_di(struct dentry *dentry) ++{ ++ return dentry->d_fsdata; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* lock subclass for dinfo */ ++enum { ++ AuLsc_DI_CHILD, /* child first */ ++ AuLsc_DI_CHILD2, /* rename(2), link(2), and cpup at hnotify */ ++ AuLsc_DI_CHILD3, /* copyup dirs */ ++ AuLsc_DI_PARENT, ++ AuLsc_DI_PARENT2, ++ AuLsc_DI_PARENT3, ++ AuLsc_DI_TMP /* temp for replacing dinfo */ ++}; ++ ++/* ++ * di_read_lock_child, di_write_lock_child, ++ * di_read_lock_child2, di_write_lock_child2, ++ * di_read_lock_child3, di_write_lock_child3, ++ * di_read_lock_parent, di_write_lock_parent, ++ * di_read_lock_parent2, di_write_lock_parent2, ++ * di_read_lock_parent3, di_write_lock_parent3, ++ */ ++#define AuReadLockFunc(name, lsc) \ ++static inline void di_read_lock_##name(struct dentry *d, int flags) \ ++{ di_read_lock(d, flags, AuLsc_DI_##lsc); } ++ ++#define AuWriteLockFunc(name, lsc) \ ++static inline void di_write_lock_##name(struct dentry *d) \ ++{ di_write_lock(d, AuLsc_DI_##lsc); } ++ ++#define AuRWLockFuncs(name, lsc) \ ++ AuReadLockFunc(name, lsc) \ ++ AuWriteLockFunc(name, lsc) ++ ++AuRWLockFuncs(child, CHILD); ++AuRWLockFuncs(child2, CHILD2); ++AuRWLockFuncs(child3, CHILD3); ++AuRWLockFuncs(parent, PARENT); ++AuRWLockFuncs(parent2, PARENT2); ++AuRWLockFuncs(parent3, PARENT3); ++ ++#undef AuReadLockFunc ++#undef AuWriteLockFunc ++#undef AuRWLockFuncs ++ ++#define DiMustNoWaiters(d) AuRwMustNoWaiters(&au_di(d)->di_rwsem) ++#define DiMustAnyLock(d) AuRwMustAnyLock(&au_di(d)->di_rwsem) ++#define DiMustWriteLock(d) AuRwMustWriteLock(&au_di(d)->di_rwsem) ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* todo: memory barrier? */ ++static inline unsigned int au_digen(struct dentry *d) ++{ ++ return atomic_read(&au_di(d)->di_generation); ++} ++ ++static inline void au_h_dentry_init(struct au_hdentry *hdentry) ++{ ++ hdentry->hd_dentry = NULL; ++} ++ ++static inline void au_hdput(struct au_hdentry *hd) ++{ ++ if (hd) ++ dput(hd->hd_dentry); ++} ++ ++static inline aufs_bindex_t au_dbtop(struct dentry *dentry) ++{ ++ DiMustAnyLock(dentry); ++ return au_di(dentry)->di_btop; ++} ++ ++static inline aufs_bindex_t au_dbbot(struct dentry *dentry) ++{ ++ DiMustAnyLock(dentry); ++ return au_di(dentry)->di_bbot; ++} ++ ++static inline aufs_bindex_t au_dbwh(struct dentry *dentry) ++{ ++ DiMustAnyLock(dentry); ++ return au_di(dentry)->di_bwh; ++} ++ ++static inline aufs_bindex_t au_dbdiropq(struct dentry *dentry) ++{ ++ DiMustAnyLock(dentry); ++ return au_di(dentry)->di_bdiropq; ++} ++ ++/* todo: hard/soft set? */ ++static inline void au_set_dbtop(struct dentry *dentry, aufs_bindex_t bindex) ++{ ++ DiMustWriteLock(dentry); ++ au_di(dentry)->di_btop = bindex; ++} ++ ++static inline void au_set_dbbot(struct dentry *dentry, aufs_bindex_t bindex) ++{ ++ DiMustWriteLock(dentry); ++ au_di(dentry)->di_bbot = bindex; ++} ++ ++static inline void au_set_dbwh(struct dentry *dentry, aufs_bindex_t bindex) ++{ ++ DiMustWriteLock(dentry); ++ /* dbwh can be outside of btop - bbot range */ ++ au_di(dentry)->di_bwh = bindex; ++} ++ ++static inline void au_set_dbdiropq(struct dentry *dentry, aufs_bindex_t bindex) ++{ ++ DiMustWriteLock(dentry); ++ au_di(dentry)->di_bdiropq = bindex; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++#ifdef CONFIG_AUFS_HNOTIFY ++static inline void au_digen_dec(struct dentry *d) ++{ ++ atomic_dec(&au_di(d)->di_generation); ++} ++ ++static inline void au_hn_di_reinit(struct dentry *dentry) ++{ ++ dentry->d_fsdata = NULL; ++} ++#else ++AuStubVoid(au_hn_di_reinit, struct dentry *dentry __maybe_unused) ++#endif /* CONFIG_AUFS_HNOTIFY */ ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_DENTRY_H__ */ +diff --git a/fs/aufs/dinfo.c b/fs/aufs/dinfo.c +new file mode 100644 +index 0000000..ff8d5fd +--- /dev/null ++++ b/fs/aufs/dinfo.c +@@ -0,0 +1,548 @@ ++/* ++ * Copyright (C) 2005-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * dentry private data ++ */ ++ ++#include "aufs.h" ++ ++void au_di_init_once(void *_dinfo) ++{ ++ struct au_dinfo *dinfo = _dinfo; ++ ++ au_rw_init(&dinfo->di_rwsem); ++} ++ ++struct au_dinfo *au_di_alloc(struct super_block *sb, unsigned int lsc) ++{ ++ struct au_dinfo *dinfo; ++ int nbr, i; ++ ++ dinfo = au_cache_alloc_dinfo(); ++ if (unlikely(!dinfo)) ++ goto out; ++ ++ nbr = au_sbbot(sb) + 1; ++ if (nbr <= 0) ++ nbr = 1; ++ dinfo->di_hdentry = kcalloc(nbr, sizeof(*dinfo->di_hdentry), GFP_NOFS); ++ if (dinfo->di_hdentry) { ++ au_rw_write_lock_nested(&dinfo->di_rwsem, lsc); ++ dinfo->di_btop = -1; ++ dinfo->di_bbot = -1; ++ dinfo->di_bwh = -1; ++ dinfo->di_bdiropq = -1; ++ dinfo->di_tmpfile = 0; ++ for (i = 0; i < nbr; i++) ++ dinfo->di_hdentry[i].hd_id = -1; ++ goto out; ++ } ++ ++ au_cache_free_dinfo(dinfo); ++ dinfo = NULL; ++ ++out: ++ return dinfo; ++} ++ ++void au_di_free(struct au_dinfo *dinfo) ++{ ++ struct au_hdentry *p; ++ aufs_bindex_t bbot, bindex; ++ ++ /* dentry may not be revalidated */ ++ bindex = dinfo->di_btop; ++ if (bindex >= 0) { ++ bbot = dinfo->di_bbot; ++ p = dinfo->di_hdentry + bindex; ++ while (bindex++ <= bbot) ++ au_hdput(p++); ++ } ++ kfree(dinfo->di_hdentry); ++ au_cache_free_dinfo(dinfo); ++} ++ ++void au_di_swap(struct au_dinfo *a, struct au_dinfo *b) ++{ ++ struct au_hdentry *p; ++ aufs_bindex_t bi; ++ ++ AuRwMustWriteLock(&a->di_rwsem); ++ AuRwMustWriteLock(&b->di_rwsem); ++ ++#define DiSwap(v, name) \ ++ do { \ ++ v = a->di_##name; \ ++ a->di_##name = b->di_##name; \ ++ b->di_##name = v; \ ++ } while (0) ++ ++ DiSwap(p, hdentry); ++ DiSwap(bi, btop); ++ DiSwap(bi, bbot); ++ DiSwap(bi, bwh); ++ DiSwap(bi, bdiropq); ++ /* smp_mb(); */ ++ ++#undef DiSwap ++} ++ ++void au_di_cp(struct au_dinfo *dst, struct au_dinfo *src) ++{ ++ AuRwMustWriteLock(&dst->di_rwsem); ++ AuRwMustWriteLock(&src->di_rwsem); ++ ++ dst->di_btop = src->di_btop; ++ dst->di_bbot = src->di_bbot; ++ dst->di_bwh = src->di_bwh; ++ dst->di_bdiropq = src->di_bdiropq; ++ /* smp_mb(); */ ++} ++ ++int au_di_init(struct dentry *dentry) ++{ ++ int err; ++ struct super_block *sb; ++ struct au_dinfo *dinfo; ++ ++ err = 0; ++ sb = dentry->d_sb; ++ dinfo = au_di_alloc(sb, AuLsc_DI_CHILD); ++ if (dinfo) { ++ atomic_set(&dinfo->di_generation, au_sigen(sb)); ++ /* smp_mb(); */ /* atomic_set */ ++ dentry->d_fsdata = dinfo; ++ } else ++ err = -ENOMEM; ++ ++ return err; ++} ++ ++void au_di_fin(struct dentry *dentry) ++{ ++ struct au_dinfo *dinfo; ++ ++ dinfo = au_di(dentry); ++ AuRwDestroy(&dinfo->di_rwsem); ++ au_di_free(dinfo); ++} ++ ++int au_di_realloc(struct au_dinfo *dinfo, int nbr) ++{ ++ int err, sz; ++ struct au_hdentry *hdp; ++ ++ AuRwMustWriteLock(&dinfo->di_rwsem); ++ ++ err = -ENOMEM; ++ sz = sizeof(*hdp) * (dinfo->di_bbot + 1); ++ if (!sz) ++ sz = sizeof(*hdp); ++ hdp = au_kzrealloc(dinfo->di_hdentry, sz, sizeof(*hdp) * nbr, GFP_NOFS); ++ if (hdp) { ++ dinfo->di_hdentry = hdp; ++ err = 0; ++ } ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static void do_ii_write_lock(struct inode *inode, unsigned int lsc) ++{ ++ switch (lsc) { ++ case AuLsc_DI_CHILD: ++ ii_write_lock_child(inode); ++ break; ++ case AuLsc_DI_CHILD2: ++ ii_write_lock_child2(inode); ++ break; ++ case AuLsc_DI_CHILD3: ++ ii_write_lock_child3(inode); ++ break; ++ case AuLsc_DI_PARENT: ++ ii_write_lock_parent(inode); ++ break; ++ case AuLsc_DI_PARENT2: ++ ii_write_lock_parent2(inode); ++ break; ++ case AuLsc_DI_PARENT3: ++ ii_write_lock_parent3(inode); ++ break; ++ default: ++ BUG(); ++ } ++} ++ ++static void do_ii_read_lock(struct inode *inode, unsigned int lsc) ++{ ++ switch (lsc) { ++ case AuLsc_DI_CHILD: ++ ii_read_lock_child(inode); ++ break; ++ case AuLsc_DI_CHILD2: ++ ii_read_lock_child2(inode); ++ break; ++ case AuLsc_DI_CHILD3: ++ ii_read_lock_child3(inode); ++ break; ++ case AuLsc_DI_PARENT: ++ ii_read_lock_parent(inode); ++ break; ++ case AuLsc_DI_PARENT2: ++ ii_read_lock_parent2(inode); ++ break; ++ case AuLsc_DI_PARENT3: ++ ii_read_lock_parent3(inode); ++ break; ++ default: ++ BUG(); ++ } ++} ++ ++void di_read_lock(struct dentry *d, int flags, unsigned int lsc) ++{ ++ struct inode *inode; ++ ++ au_rw_read_lock_nested(&au_di(d)->di_rwsem, lsc); ++ if (d_really_is_positive(d)) { ++ inode = d_inode(d); ++ if (au_ftest_lock(flags, IW)) ++ do_ii_write_lock(inode, lsc); ++ else if (au_ftest_lock(flags, IR)) ++ do_ii_read_lock(inode, lsc); ++ } ++} ++ ++void di_read_unlock(struct dentry *d, int flags) ++{ ++ struct inode *inode; ++ ++ if (d_really_is_positive(d)) { ++ inode = d_inode(d); ++ if (au_ftest_lock(flags, IW)) { ++ au_dbg_verify_dinode(d); ++ ii_write_unlock(inode); ++ } else if (au_ftest_lock(flags, IR)) { ++ au_dbg_verify_dinode(d); ++ ii_read_unlock(inode); ++ } ++ } ++ au_rw_read_unlock(&au_di(d)->di_rwsem); ++} ++ ++void di_downgrade_lock(struct dentry *d, int flags) ++{ ++ if (d_really_is_positive(d) && au_ftest_lock(flags, IR)) ++ ii_downgrade_lock(d_inode(d)); ++ au_rw_dgrade_lock(&au_di(d)->di_rwsem); ++} ++ ++void di_write_lock(struct dentry *d, unsigned int lsc) ++{ ++ au_rw_write_lock_nested(&au_di(d)->di_rwsem, lsc); ++ if (d_really_is_positive(d)) ++ do_ii_write_lock(d_inode(d), lsc); ++} ++ ++void di_write_unlock(struct dentry *d) ++{ ++ au_dbg_verify_dinode(d); ++ if (d_really_is_positive(d)) ++ ii_write_unlock(d_inode(d)); ++ au_rw_write_unlock(&au_di(d)->di_rwsem); ++} ++ ++void di_write_lock2_child(struct dentry *d1, struct dentry *d2, int isdir) ++{ ++ AuDebugOn(d1 == d2 ++ || d_inode(d1) == d_inode(d2) ++ || d1->d_sb != d2->d_sb); ++ ++ if (isdir && au_test_subdir(d1, d2)) { ++ di_write_lock_child(d1); ++ di_write_lock_child2(d2); ++ } else { ++ /* there should be no races */ ++ di_write_lock_child(d2); ++ di_write_lock_child2(d1); ++ } ++} ++ ++void di_write_lock2_parent(struct dentry *d1, struct dentry *d2, int isdir) ++{ ++ AuDebugOn(d1 == d2 ++ || d_inode(d1) == d_inode(d2) ++ || d1->d_sb != d2->d_sb); ++ ++ if (isdir && au_test_subdir(d1, d2)) { ++ di_write_lock_parent(d1); ++ di_write_lock_parent2(d2); ++ } else { ++ /* there should be no races */ ++ di_write_lock_parent(d2); ++ di_write_lock_parent2(d1); ++ } ++} ++ ++void di_write_unlock2(struct dentry *d1, struct dentry *d2) ++{ ++ di_write_unlock(d1); ++ if (d_inode(d1) == d_inode(d2)) ++ au_rw_write_unlock(&au_di(d2)->di_rwsem); ++ else ++ di_write_unlock(d2); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct dentry *au_h_dptr(struct dentry *dentry, aufs_bindex_t bindex) ++{ ++ struct dentry *d; ++ ++ DiMustAnyLock(dentry); ++ ++ if (au_dbtop(dentry) < 0 || bindex < au_dbtop(dentry)) ++ return NULL; ++ AuDebugOn(bindex < 0); ++ d = au_di(dentry)->di_hdentry[0 + bindex].hd_dentry; ++ AuDebugOn(d && au_dcount(d) <= 0); ++ return d; ++} ++ ++/* ++ * extended version of au_h_dptr(). ++ * returns a hashed and positive (or linkable) h_dentry in bindex, NULL, or ++ * error. ++ */ ++struct dentry *au_h_d_alias(struct dentry *dentry, aufs_bindex_t bindex) ++{ ++ struct dentry *h_dentry; ++ struct inode *inode, *h_inode; ++ ++ AuDebugOn(d_really_is_negative(dentry)); ++ ++ h_dentry = NULL; ++ if (au_dbtop(dentry) <= bindex ++ && bindex <= au_dbbot(dentry)) ++ h_dentry = au_h_dptr(dentry, bindex); ++ if (h_dentry && !au_d_linkable(h_dentry)) { ++ dget(h_dentry); ++ goto out; /* success */ ++ } ++ ++ inode = d_inode(dentry); ++ AuDebugOn(bindex < au_ibtop(inode)); ++ AuDebugOn(au_ibbot(inode) < bindex); ++ h_inode = au_h_iptr(inode, bindex); ++ h_dentry = d_find_alias(h_inode); ++ if (h_dentry) { ++ if (!IS_ERR(h_dentry)) { ++ if (!au_d_linkable(h_dentry)) ++ goto out; /* success */ ++ dput(h_dentry); ++ } else ++ goto out; ++ } ++ ++ if (au_opt_test(au_mntflags(dentry->d_sb), PLINK)) { ++ h_dentry = au_plink_lkup(inode, bindex); ++ AuDebugOn(!h_dentry); ++ if (!IS_ERR(h_dentry)) { ++ if (!au_d_hashed_positive(h_dentry)) ++ goto out; /* success */ ++ dput(h_dentry); ++ h_dentry = NULL; ++ } ++ } ++ ++out: ++ AuDbgDentry(h_dentry); ++ return h_dentry; ++} ++ ++aufs_bindex_t au_dbtail(struct dentry *dentry) ++{ ++ aufs_bindex_t bbot, bwh; ++ ++ bbot = au_dbbot(dentry); ++ if (0 <= bbot) { ++ bwh = au_dbwh(dentry); ++ if (!bwh) ++ return bwh; ++ if (0 < bwh && bwh < bbot) ++ return bwh - 1; ++ } ++ return bbot; ++} ++ ++aufs_bindex_t au_dbtaildir(struct dentry *dentry) ++{ ++ aufs_bindex_t bbot, bopq; ++ ++ bbot = au_dbtail(dentry); ++ if (0 <= bbot) { ++ bopq = au_dbdiropq(dentry); ++ if (0 <= bopq && bopq < bbot) ++ bbot = bopq; ++ } ++ return bbot; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++void au_set_h_dptr(struct dentry *dentry, aufs_bindex_t bindex, ++ struct dentry *h_dentry) ++{ ++ struct au_hdentry *hd = au_di(dentry)->di_hdentry + bindex; ++ struct au_branch *br; ++ ++ DiMustWriteLock(dentry); ++ ++ au_hdput(hd); ++ hd->hd_dentry = h_dentry; ++ if (h_dentry) { ++ br = au_sbr(dentry->d_sb, bindex); ++ hd->hd_id = br->br_id; ++ } ++} ++ ++int au_dbrange_test(struct dentry *dentry) ++{ ++ int err; ++ aufs_bindex_t btop, bbot; ++ ++ err = 0; ++ btop = au_dbtop(dentry); ++ bbot = au_dbbot(dentry); ++ if (btop >= 0) ++ AuDebugOn(bbot < 0 && btop > bbot); ++ else { ++ err = -EIO; ++ AuDebugOn(bbot >= 0); ++ } ++ ++ return err; ++} ++ ++int au_digen_test(struct dentry *dentry, unsigned int sigen) ++{ ++ int err; ++ ++ err = 0; ++ if (unlikely(au_digen(dentry) != sigen ++ || au_iigen_test(d_inode(dentry), sigen))) ++ err = -EIO; ++ ++ return err; ++} ++ ++void au_update_digen(struct dentry *dentry) ++{ ++ atomic_set(&au_di(dentry)->di_generation, au_sigen(dentry->d_sb)); ++ /* smp_mb(); */ /* atomic_set */ ++} ++ ++void au_update_dbrange(struct dentry *dentry, int do_put_zero) ++{ ++ struct au_dinfo *dinfo; ++ struct dentry *h_d; ++ struct au_hdentry *hdp; ++ ++ DiMustWriteLock(dentry); ++ ++ dinfo = au_di(dentry); ++ if (!dinfo || dinfo->di_btop < 0) ++ return; ++ ++ hdp = dinfo->di_hdentry; ++ if (do_put_zero) { ++ aufs_bindex_t bindex, bbot; ++ ++ bbot = dinfo->di_bbot; ++ for (bindex = dinfo->di_btop; bindex <= bbot; bindex++) { ++ h_d = hdp[0 + bindex].hd_dentry; ++ if (h_d && d_is_negative(h_d)) ++ au_set_h_dptr(dentry, bindex, NULL); ++ } ++ } ++ ++ dinfo->di_btop = -1; ++ while (++dinfo->di_btop <= dinfo->di_bbot) ++ if (hdp[0 + dinfo->di_btop].hd_dentry) ++ break; ++ if (dinfo->di_btop > dinfo->di_bbot) { ++ dinfo->di_btop = -1; ++ dinfo->di_bbot = -1; ++ return; ++ } ++ ++ dinfo->di_bbot++; ++ while (0 <= --dinfo->di_bbot) ++ if (hdp[0 + dinfo->di_bbot].hd_dentry) ++ break; ++ AuDebugOn(dinfo->di_btop > dinfo->di_bbot || dinfo->di_bbot < 0); ++} ++ ++void au_update_dbtop(struct dentry *dentry) ++{ ++ aufs_bindex_t bindex, bbot; ++ struct dentry *h_dentry; ++ ++ bbot = au_dbbot(dentry); ++ for (bindex = au_dbtop(dentry); bindex <= bbot; bindex++) { ++ h_dentry = au_h_dptr(dentry, bindex); ++ if (!h_dentry) ++ continue; ++ if (d_is_positive(h_dentry)) { ++ au_set_dbtop(dentry, bindex); ++ return; ++ } ++ au_set_h_dptr(dentry, bindex, NULL); ++ } ++} ++ ++void au_update_dbbot(struct dentry *dentry) ++{ ++ aufs_bindex_t bindex, btop; ++ struct dentry *h_dentry; ++ ++ btop = au_dbtop(dentry); ++ for (bindex = au_dbbot(dentry); bindex >= btop; bindex--) { ++ h_dentry = au_h_dptr(dentry, bindex); ++ if (!h_dentry) ++ continue; ++ if (d_is_positive(h_dentry)) { ++ au_set_dbbot(dentry, bindex); ++ return; ++ } ++ au_set_h_dptr(dentry, bindex, NULL); ++ } ++} ++ ++int au_find_dbindex(struct dentry *dentry, struct dentry *h_dentry) ++{ ++ aufs_bindex_t bindex, bbot; ++ ++ bbot = au_dbbot(dentry); ++ for (bindex = au_dbtop(dentry); bindex <= bbot; bindex++) ++ if (au_h_dptr(dentry, bindex) == h_dentry) ++ return bindex; ++ return -1; ++} +diff --git a/fs/aufs/dir.c b/fs/aufs/dir.c +new file mode 100644 +index 0000000..fd38c54 +--- /dev/null ++++ b/fs/aufs/dir.c +@@ -0,0 +1,756 @@ ++/* ++ * Copyright (C) 2005-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * directory operations ++ */ ++ ++#include ++#include "aufs.h" ++ ++void au_add_nlink(struct inode *dir, struct inode *h_dir) ++{ ++ unsigned int nlink; ++ ++ AuDebugOn(!S_ISDIR(dir->i_mode) || !S_ISDIR(h_dir->i_mode)); ++ ++ nlink = dir->i_nlink; ++ nlink += h_dir->i_nlink - 2; ++ if (h_dir->i_nlink < 2) ++ nlink += 2; ++ smp_mb(); /* for i_nlink */ ++ /* 0 can happen in revaliding */ ++ set_nlink(dir, nlink); ++} ++ ++void au_sub_nlink(struct inode *dir, struct inode *h_dir) ++{ ++ unsigned int nlink; ++ ++ AuDebugOn(!S_ISDIR(dir->i_mode) || !S_ISDIR(h_dir->i_mode)); ++ ++ nlink = dir->i_nlink; ++ nlink -= h_dir->i_nlink - 2; ++ if (h_dir->i_nlink < 2) ++ nlink -= 2; ++ smp_mb(); /* for i_nlink */ ++ /* nlink == 0 means the branch-fs is broken */ ++ set_nlink(dir, nlink); ++} ++ ++loff_t au_dir_size(struct file *file, struct dentry *dentry) ++{ ++ loff_t sz; ++ aufs_bindex_t bindex, bbot; ++ struct file *h_file; ++ struct dentry *h_dentry; ++ ++ sz = 0; ++ if (file) { ++ AuDebugOn(!d_is_dir(file->f_path.dentry)); ++ ++ bbot = au_fbbot_dir(file); ++ for (bindex = au_fbtop(file); ++ bindex <= bbot && sz < KMALLOC_MAX_SIZE; ++ bindex++) { ++ h_file = au_hf_dir(file, bindex); ++ if (h_file && file_inode(h_file)) ++ sz += vfsub_f_size_read(h_file); ++ } ++ } else { ++ AuDebugOn(!dentry); ++ AuDebugOn(!d_is_dir(dentry)); ++ ++ bbot = au_dbtaildir(dentry); ++ for (bindex = au_dbtop(dentry); ++ bindex <= bbot && sz < KMALLOC_MAX_SIZE; ++ bindex++) { ++ h_dentry = au_h_dptr(dentry, bindex); ++ if (h_dentry && d_is_positive(h_dentry)) ++ sz += i_size_read(d_inode(h_dentry)); ++ } ++ } ++ if (sz < KMALLOC_MAX_SIZE) ++ sz = roundup_pow_of_two(sz); ++ if (sz > KMALLOC_MAX_SIZE) ++ sz = KMALLOC_MAX_SIZE; ++ else if (sz < NAME_MAX) { ++ BUILD_BUG_ON(AUFS_RDBLK_DEF < NAME_MAX); ++ sz = AUFS_RDBLK_DEF; ++ } ++ return sz; ++} ++ ++struct au_dir_ts_arg { ++ struct dentry *dentry; ++ aufs_bindex_t brid; ++}; ++ ++static void au_do_dir_ts(void *arg) ++{ ++ struct au_dir_ts_arg *a = arg; ++ struct au_dtime dt; ++ struct path h_path; ++ struct inode *dir, *h_dir; ++ struct super_block *sb; ++ struct au_branch *br; ++ struct au_hinode *hdir; ++ int err; ++ aufs_bindex_t btop, bindex; ++ ++ sb = a->dentry->d_sb; ++ if (d_really_is_negative(a->dentry)) ++ goto out; ++ /* no dir->i_mutex lock */ ++ aufs_read_lock(a->dentry, AuLock_DW); /* noflush */ ++ ++ dir = d_inode(a->dentry); ++ btop = au_ibtop(dir); ++ bindex = au_br_index(sb, a->brid); ++ if (bindex < btop) ++ goto out_unlock; ++ ++ br = au_sbr(sb, bindex); ++ h_path.dentry = au_h_dptr(a->dentry, bindex); ++ if (!h_path.dentry) ++ goto out_unlock; ++ h_path.mnt = au_br_mnt(br); ++ au_dtime_store(&dt, a->dentry, &h_path); ++ ++ br = au_sbr(sb, btop); ++ if (!au_br_writable(br->br_perm)) ++ goto out_unlock; ++ h_path.dentry = au_h_dptr(a->dentry, btop); ++ h_path.mnt = au_br_mnt(br); ++ err = vfsub_mnt_want_write(h_path.mnt); ++ if (err) ++ goto out_unlock; ++ hdir = au_hi(dir, btop); ++ au_hn_imtx_lock_nested(hdir, AuLsc_I_PARENT); ++ h_dir = au_h_iptr(dir, btop); ++ if (h_dir->i_nlink ++ && timespec_compare(&h_dir->i_mtime, &dt.dt_mtime) < 0) { ++ dt.dt_h_path = h_path; ++ au_dtime_revert(&dt); ++ } ++ au_hn_imtx_unlock(hdir); ++ vfsub_mnt_drop_write(h_path.mnt); ++ au_cpup_attr_timesizes(dir); ++ ++out_unlock: ++ aufs_read_unlock(a->dentry, AuLock_DW); ++out: ++ dput(a->dentry); ++ au_nwt_done(&au_sbi(sb)->si_nowait); ++ kfree(arg); ++} ++ ++void au_dir_ts(struct inode *dir, aufs_bindex_t bindex) ++{ ++ int perm, wkq_err; ++ aufs_bindex_t btop; ++ struct au_dir_ts_arg *arg; ++ struct dentry *dentry; ++ struct super_block *sb; ++ ++ IMustLock(dir); ++ ++ dentry = d_find_any_alias(dir); ++ AuDebugOn(!dentry); ++ sb = dentry->d_sb; ++ btop = au_ibtop(dir); ++ if (btop == bindex) { ++ au_cpup_attr_timesizes(dir); ++ goto out; ++ } ++ ++ perm = au_sbr_perm(sb, btop); ++ if (!au_br_writable(perm)) ++ goto out; ++ ++ arg = kmalloc(sizeof(*arg), GFP_NOFS); ++ if (!arg) ++ goto out; ++ ++ arg->dentry = dget(dentry); /* will be dput-ted by au_do_dir_ts() */ ++ arg->brid = au_sbr_id(sb, bindex); ++ wkq_err = au_wkq_nowait(au_do_dir_ts, arg, sb, /*flags*/0); ++ if (unlikely(wkq_err)) { ++ pr_err("wkq %d\n", wkq_err); ++ dput(dentry); ++ kfree(arg); ++ } ++ ++out: ++ dput(dentry); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int reopen_dir(struct file *file) ++{ ++ int err; ++ unsigned int flags; ++ aufs_bindex_t bindex, btail, btop; ++ struct dentry *dentry, *h_dentry; ++ struct file *h_file; ++ ++ /* open all lower dirs */ ++ dentry = file->f_path.dentry; ++ btop = au_dbtop(dentry); ++ for (bindex = au_fbtop(file); bindex < btop; bindex++) ++ au_set_h_fptr(file, bindex, NULL); ++ au_set_fbtop(file, btop); ++ ++ btail = au_dbtaildir(dentry); ++ for (bindex = au_fbbot_dir(file); btail < bindex; bindex--) ++ au_set_h_fptr(file, bindex, NULL); ++ au_set_fbbot_dir(file, btail); ++ ++ flags = vfsub_file_flags(file); ++ for (bindex = btop; bindex <= btail; bindex++) { ++ h_dentry = au_h_dptr(dentry, bindex); ++ if (!h_dentry) ++ continue; ++ h_file = au_hf_dir(file, bindex); ++ if (h_file) ++ continue; ++ ++ h_file = au_h_open(dentry, bindex, flags, file, /*force_wr*/0); ++ err = PTR_ERR(h_file); ++ if (IS_ERR(h_file)) ++ goto out; /* close all? */ ++ au_set_h_fptr(file, bindex, h_file); ++ } ++ au_update_figen(file); ++ /* todo: necessary? */ ++ /* file->f_ra = h_file->f_ra; */ ++ err = 0; ++ ++out: ++ return err; ++} ++ ++static int do_open_dir(struct file *file, int flags, struct file *h_file) ++{ ++ int err; ++ aufs_bindex_t bindex, btail; ++ struct dentry *dentry, *h_dentry; ++ struct vfsmount *mnt; ++ ++ FiMustWriteLock(file); ++ AuDebugOn(h_file); ++ ++ err = 0; ++ mnt = file->f_path.mnt; ++ dentry = file->f_path.dentry; ++ file->f_version = d_inode(dentry)->i_version; ++ bindex = au_dbtop(dentry); ++ au_set_fbtop(file, bindex); ++ btail = au_dbtaildir(dentry); ++ au_set_fbbot_dir(file, btail); ++ for (; !err && bindex <= btail; bindex++) { ++ h_dentry = au_h_dptr(dentry, bindex); ++ if (!h_dentry) ++ continue; ++ ++ err = vfsub_test_mntns(mnt, h_dentry->d_sb); ++ if (unlikely(err)) ++ break; ++ h_file = au_h_open(dentry, bindex, flags, file, /*force_wr*/0); ++ if (IS_ERR(h_file)) { ++ err = PTR_ERR(h_file); ++ break; ++ } ++ au_set_h_fptr(file, bindex, h_file); ++ } ++ au_update_figen(file); ++ /* todo: necessary? */ ++ /* file->f_ra = h_file->f_ra; */ ++ if (!err) ++ return 0; /* success */ ++ ++ /* close all */ ++ for (bindex = au_fbtop(file); bindex <= btail; bindex++) ++ au_set_h_fptr(file, bindex, NULL); ++ au_set_fbtop(file, -1); ++ au_set_fbbot_dir(file, -1); ++ ++ return err; ++} ++ ++static int aufs_open_dir(struct inode *inode __maybe_unused, ++ struct file *file) ++{ ++ int err; ++ struct super_block *sb; ++ struct au_fidir *fidir; ++ ++ err = -ENOMEM; ++ sb = file->f_path.dentry->d_sb; ++ si_read_lock(sb, AuLock_FLUSH); ++ fidir = au_fidir_alloc(sb); ++ if (fidir) { ++ struct au_do_open_args args = { ++ .open = do_open_dir, ++ .fidir = fidir ++ }; ++ err = au_do_open(file, &args); ++ if (unlikely(err)) ++ kfree(fidir); ++ } ++ si_read_unlock(sb); ++ return err; ++} ++ ++static int aufs_release_dir(struct inode *inode __maybe_unused, ++ struct file *file) ++{ ++ struct au_vdir *vdir_cache; ++ struct au_finfo *finfo; ++ struct au_fidir *fidir; ++ aufs_bindex_t bindex, bbot; ++ ++ finfo = au_fi(file); ++ fidir = finfo->fi_hdir; ++ if (fidir) { ++ au_sphl_del(&finfo->fi_hlist, ++ &au_sbi(file->f_path.dentry->d_sb)->si_files); ++ vdir_cache = fidir->fd_vdir_cache; /* lock-free */ ++ if (vdir_cache) ++ au_vdir_free(vdir_cache); ++ ++ bindex = finfo->fi_btop; ++ if (bindex >= 0) { ++ /* ++ * calls fput() instead of filp_close(), ++ * since no dnotify or lock for the lower file. ++ */ ++ bbot = fidir->fd_bbot; ++ for (; bindex <= bbot; bindex++) ++ au_set_h_fptr(file, bindex, NULL); ++ } ++ kfree(fidir); ++ finfo->fi_hdir = NULL; ++ } ++ au_finfo_fin(file); ++ return 0; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int au_do_flush_dir(struct file *file, fl_owner_t id) ++{ ++ int err; ++ aufs_bindex_t bindex, bbot; ++ struct file *h_file; ++ ++ err = 0; ++ bbot = au_fbbot_dir(file); ++ for (bindex = au_fbtop(file); !err && bindex <= bbot; bindex++) { ++ h_file = au_hf_dir(file, bindex); ++ if (h_file) ++ err = vfsub_flush(h_file, id); ++ } ++ return err; ++} ++ ++static int aufs_flush_dir(struct file *file, fl_owner_t id) ++{ ++ return au_do_flush(file, id, au_do_flush_dir); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int au_do_fsync_dir_no_file(struct dentry *dentry, int datasync) ++{ ++ int err; ++ aufs_bindex_t bbot, bindex; ++ struct inode *inode; ++ struct super_block *sb; ++ ++ err = 0; ++ sb = dentry->d_sb; ++ inode = d_inode(dentry); ++ IMustLock(inode); ++ bbot = au_dbbot(dentry); ++ for (bindex = au_dbtop(dentry); !err && bindex <= bbot; bindex++) { ++ struct path h_path; ++ ++ if (au_test_ro(sb, bindex, inode)) ++ continue; ++ h_path.dentry = au_h_dptr(dentry, bindex); ++ if (!h_path.dentry) ++ continue; ++ ++ h_path.mnt = au_sbr_mnt(sb, bindex); ++ err = vfsub_fsync(NULL, &h_path, datasync); ++ } ++ ++ return err; ++} ++ ++static int au_do_fsync_dir(struct file *file, int datasync) ++{ ++ int err; ++ aufs_bindex_t bbot, bindex; ++ struct file *h_file; ++ struct super_block *sb; ++ struct inode *inode; ++ ++ err = au_reval_and_lock_fdi(file, reopen_dir, /*wlock*/1); ++ if (unlikely(err)) ++ goto out; ++ ++ inode = file_inode(file); ++ sb = inode->i_sb; ++ bbot = au_fbbot_dir(file); ++ for (bindex = au_fbtop(file); !err && bindex <= bbot; bindex++) { ++ h_file = au_hf_dir(file, bindex); ++ if (!h_file || au_test_ro(sb, bindex, inode)) ++ continue; ++ ++ err = vfsub_fsync(h_file, &h_file->f_path, datasync); ++ } ++ ++out: ++ return err; ++} ++ ++/* ++ * @file may be NULL ++ */ ++static int aufs_fsync_dir(struct file *file, loff_t start, loff_t end, ++ int datasync) ++{ ++ int err; ++ struct dentry *dentry; ++ struct inode *inode; ++ struct super_block *sb; ++ ++ err = 0; ++ dentry = file->f_path.dentry; ++ inode = d_inode(dentry); ++ inode_lock(inode); ++ sb = dentry->d_sb; ++ si_noflush_read_lock(sb); ++ if (file) ++ err = au_do_fsync_dir(file, datasync); ++ else { ++ di_write_lock_child(dentry); ++ err = au_do_fsync_dir_no_file(dentry, datasync); ++ } ++ au_cpup_attr_timesizes(inode); ++ di_write_unlock(dentry); ++ if (file) ++ fi_write_unlock(file); ++ ++ si_read_unlock(sb); ++ inode_unlock(inode); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int aufs_iterate(struct file *file, struct dir_context *ctx) ++{ ++ int err; ++ struct dentry *dentry; ++ struct inode *inode, *h_inode; ++ struct super_block *sb; ++ ++ AuDbg("%pD, ctx{%pf, %llu}\n", file, ctx->actor, ctx->pos); ++ ++ dentry = file->f_path.dentry; ++ inode = d_inode(dentry); ++ IMustLock(inode); ++ ++ sb = dentry->d_sb; ++ si_read_lock(sb, AuLock_FLUSH); ++ err = au_reval_and_lock_fdi(file, reopen_dir, /*wlock*/1); ++ if (unlikely(err)) ++ goto out; ++ err = au_alive_dir(dentry); ++ if (!err) ++ err = au_vdir_init(file); ++ di_downgrade_lock(dentry, AuLock_IR); ++ if (unlikely(err)) ++ goto out_unlock; ++ ++ h_inode = au_h_iptr(inode, au_ibtop(inode)); ++ if (!au_test_nfsd()) { ++ err = au_vdir_fill_de(file, ctx); ++ fsstack_copy_attr_atime(inode, h_inode); ++ } else { ++ /* ++ * nfsd filldir may call lookup_one_len(), vfs_getattr(), ++ * encode_fh() and others. ++ */ ++ atomic_inc(&h_inode->i_count); ++ di_read_unlock(dentry, AuLock_IR); ++ si_read_unlock(sb); ++ err = au_vdir_fill_de(file, ctx); ++ fsstack_copy_attr_atime(inode, h_inode); ++ fi_write_unlock(file); ++ iput(h_inode); ++ ++ AuTraceErr(err); ++ return err; ++ } ++ ++out_unlock: ++ di_read_unlock(dentry, AuLock_IR); ++ fi_write_unlock(file); ++out: ++ si_read_unlock(sb); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++#define AuTestEmpty_WHONLY 1 ++#define AuTestEmpty_CALLED (1 << 1) ++#define AuTestEmpty_SHWH (1 << 2) ++#define au_ftest_testempty(flags, name) ((flags) & AuTestEmpty_##name) ++#define au_fset_testempty(flags, name) \ ++ do { (flags) |= AuTestEmpty_##name; } while (0) ++#define au_fclr_testempty(flags, name) \ ++ do { (flags) &= ~AuTestEmpty_##name; } while (0) ++ ++#ifndef CONFIG_AUFS_SHWH ++#undef AuTestEmpty_SHWH ++#define AuTestEmpty_SHWH 0 ++#endif ++ ++struct test_empty_arg { ++ struct dir_context ctx; ++ struct au_nhash *whlist; ++ unsigned int flags; ++ int err; ++ aufs_bindex_t bindex; ++}; ++ ++static int test_empty_cb(struct dir_context *ctx, const char *__name, ++ int namelen, loff_t offset __maybe_unused, u64 ino, ++ unsigned int d_type) ++{ ++ struct test_empty_arg *arg = container_of(ctx, struct test_empty_arg, ++ ctx); ++ char *name = (void *)__name; ++ ++ arg->err = 0; ++ au_fset_testempty(arg->flags, CALLED); ++ /* smp_mb(); */ ++ if (name[0] == '.' ++ && (namelen == 1 || (name[1] == '.' && namelen == 2))) ++ goto out; /* success */ ++ ++ if (namelen <= AUFS_WH_PFX_LEN ++ || memcmp(name, AUFS_WH_PFX, AUFS_WH_PFX_LEN)) { ++ if (au_ftest_testempty(arg->flags, WHONLY) ++ && !au_nhash_test_known_wh(arg->whlist, name, namelen)) ++ arg->err = -ENOTEMPTY; ++ goto out; ++ } ++ ++ name += AUFS_WH_PFX_LEN; ++ namelen -= AUFS_WH_PFX_LEN; ++ if (!au_nhash_test_known_wh(arg->whlist, name, namelen)) ++ arg->err = au_nhash_append_wh ++ (arg->whlist, name, namelen, ino, d_type, arg->bindex, ++ au_ftest_testempty(arg->flags, SHWH)); ++ ++out: ++ /* smp_mb(); */ ++ AuTraceErr(arg->err); ++ return arg->err; ++} ++ ++static int do_test_empty(struct dentry *dentry, struct test_empty_arg *arg) ++{ ++ int err; ++ struct file *h_file; ++ ++ h_file = au_h_open(dentry, arg->bindex, ++ O_RDONLY | O_NONBLOCK | O_DIRECTORY | O_LARGEFILE, ++ /*file*/NULL, /*force_wr*/0); ++ err = PTR_ERR(h_file); ++ if (IS_ERR(h_file)) ++ goto out; ++ ++ err = 0; ++ if (!au_opt_test(au_mntflags(dentry->d_sb), UDBA_NONE) ++ && !file_inode(h_file)->i_nlink) ++ goto out_put; ++ ++ do { ++ arg->err = 0; ++ au_fclr_testempty(arg->flags, CALLED); ++ /* smp_mb(); */ ++ err = vfsub_iterate_dir(h_file, &arg->ctx); ++ if (err >= 0) ++ err = arg->err; ++ } while (!err && au_ftest_testempty(arg->flags, CALLED)); ++ ++out_put: ++ fput(h_file); ++ au_sbr_put(dentry->d_sb, arg->bindex); ++out: ++ return err; ++} ++ ++struct do_test_empty_args { ++ int *errp; ++ struct dentry *dentry; ++ struct test_empty_arg *arg; ++}; ++ ++static void call_do_test_empty(void *args) ++{ ++ struct do_test_empty_args *a = args; ++ *a->errp = do_test_empty(a->dentry, a->arg); ++} ++ ++static int sio_test_empty(struct dentry *dentry, struct test_empty_arg *arg) ++{ ++ int err, wkq_err; ++ struct dentry *h_dentry; ++ struct inode *h_inode; ++ ++ h_dentry = au_h_dptr(dentry, arg->bindex); ++ h_inode = d_inode(h_dentry); ++ /* todo: i_mode changes anytime? */ ++ inode_lock_nested(h_inode, AuLsc_I_CHILD); ++ err = au_test_h_perm_sio(h_inode, MAY_EXEC | MAY_READ); ++ inode_unlock(h_inode); ++ if (!err) ++ err = do_test_empty(dentry, arg); ++ else { ++ struct do_test_empty_args args = { ++ .errp = &err, ++ .dentry = dentry, ++ .arg = arg ++ }; ++ unsigned int flags = arg->flags; ++ ++ wkq_err = au_wkq_wait(call_do_test_empty, &args); ++ if (unlikely(wkq_err)) ++ err = wkq_err; ++ arg->flags = flags; ++ } ++ ++ return err; ++} ++ ++int au_test_empty_lower(struct dentry *dentry) ++{ ++ int err; ++ unsigned int rdhash; ++ aufs_bindex_t bindex, btop, btail; ++ struct au_nhash whlist; ++ struct test_empty_arg arg = { ++ .ctx = { ++ .actor = test_empty_cb ++ } ++ }; ++ int (*test_empty)(struct dentry *dentry, struct test_empty_arg *arg); ++ ++ SiMustAnyLock(dentry->d_sb); ++ ++ rdhash = au_sbi(dentry->d_sb)->si_rdhash; ++ if (!rdhash) ++ rdhash = au_rdhash_est(au_dir_size(/*file*/NULL, dentry)); ++ err = au_nhash_alloc(&whlist, rdhash, GFP_NOFS); ++ if (unlikely(err)) ++ goto out; ++ ++ arg.flags = 0; ++ arg.whlist = &whlist; ++ btop = au_dbtop(dentry); ++ if (au_opt_test(au_mntflags(dentry->d_sb), SHWH)) ++ au_fset_testempty(arg.flags, SHWH); ++ test_empty = do_test_empty; ++ if (au_opt_test(au_mntflags(dentry->d_sb), DIRPERM1)) ++ test_empty = sio_test_empty; ++ arg.bindex = btop; ++ err = test_empty(dentry, &arg); ++ if (unlikely(err)) ++ goto out_whlist; ++ ++ au_fset_testempty(arg.flags, WHONLY); ++ btail = au_dbtaildir(dentry); ++ for (bindex = btop + 1; !err && bindex <= btail; bindex++) { ++ struct dentry *h_dentry; ++ ++ h_dentry = au_h_dptr(dentry, bindex); ++ if (h_dentry && d_is_positive(h_dentry)) { ++ arg.bindex = bindex; ++ err = test_empty(dentry, &arg); ++ } ++ } ++ ++out_whlist: ++ au_nhash_wh_free(&whlist); ++out: ++ return err; ++} ++ ++int au_test_empty(struct dentry *dentry, struct au_nhash *whlist) ++{ ++ int err; ++ struct test_empty_arg arg = { ++ .ctx = { ++ .actor = test_empty_cb ++ } ++ }; ++ aufs_bindex_t bindex, btail; ++ ++ err = 0; ++ arg.whlist = whlist; ++ arg.flags = AuTestEmpty_WHONLY; ++ if (au_opt_test(au_mntflags(dentry->d_sb), SHWH)) ++ au_fset_testempty(arg.flags, SHWH); ++ btail = au_dbtaildir(dentry); ++ for (bindex = au_dbtop(dentry); !err && bindex <= btail; bindex++) { ++ struct dentry *h_dentry; ++ ++ h_dentry = au_h_dptr(dentry, bindex); ++ if (h_dentry && d_is_positive(h_dentry)) { ++ arg.bindex = bindex; ++ err = sio_test_empty(dentry, &arg); ++ } ++ } ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++const struct file_operations aufs_dir_fop = { ++ .owner = THIS_MODULE, ++ .llseek = default_llseek, ++ .read = generic_read_dir, ++ .iterate = aufs_iterate, ++ .unlocked_ioctl = aufs_ioctl_dir, ++#ifdef CONFIG_COMPAT ++ .compat_ioctl = aufs_compat_ioctl_dir, ++#endif ++ .open = aufs_open_dir, ++ .release = aufs_release_dir, ++ .flush = aufs_flush_dir, ++ .fsync = aufs_fsync_dir ++}; +diff --git a/fs/aufs/dir.h b/fs/aufs/dir.h +new file mode 100644 +index 0000000..16821f9 +--- /dev/null ++++ b/fs/aufs/dir.h +@@ -0,0 +1,131 @@ ++/* ++ * Copyright (C) 2005-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * directory operations ++ */ ++ ++#ifndef __AUFS_DIR_H__ ++#define __AUFS_DIR_H__ ++ ++#ifdef __KERNEL__ ++ ++#include ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* need to be faster and smaller */ ++ ++struct au_nhash { ++ unsigned int nh_num; ++ struct hlist_head *nh_head; ++}; ++ ++struct au_vdir_destr { ++ unsigned char len; ++ unsigned char name[0]; ++} __packed; ++ ++struct au_vdir_dehstr { ++ struct hlist_node hash; ++ struct au_vdir_destr *str; ++} ____cacheline_aligned_in_smp; ++ ++struct au_vdir_de { ++ ino_t de_ino; ++ unsigned char de_type; ++ /* caution: packed */ ++ struct au_vdir_destr de_str; ++} __packed; ++ ++struct au_vdir_wh { ++ struct hlist_node wh_hash; ++#ifdef CONFIG_AUFS_SHWH ++ ino_t wh_ino; ++ aufs_bindex_t wh_bindex; ++ unsigned char wh_type; ++#else ++ aufs_bindex_t wh_bindex; ++#endif ++ /* caution: packed */ ++ struct au_vdir_destr wh_str; ++} __packed; ++ ++union au_vdir_deblk_p { ++ unsigned char *deblk; ++ struct au_vdir_de *de; ++}; ++ ++struct au_vdir { ++ unsigned char **vd_deblk; ++ unsigned long vd_nblk; ++ struct { ++ unsigned long ul; ++ union au_vdir_deblk_p p; ++ } vd_last; ++ ++ unsigned long vd_version; ++ unsigned int vd_deblk_sz; ++ unsigned long vd_jiffy; ++} ____cacheline_aligned_in_smp; ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* dir.c */ ++extern const struct file_operations aufs_dir_fop; ++void au_add_nlink(struct inode *dir, struct inode *h_dir); ++void au_sub_nlink(struct inode *dir, struct inode *h_dir); ++loff_t au_dir_size(struct file *file, struct dentry *dentry); ++void au_dir_ts(struct inode *dir, aufs_bindex_t bsrc); ++int au_test_empty_lower(struct dentry *dentry); ++int au_test_empty(struct dentry *dentry, struct au_nhash *whlist); ++ ++/* vdir.c */ ++unsigned int au_rdhash_est(loff_t sz); ++int au_nhash_alloc(struct au_nhash *nhash, unsigned int num_hash, gfp_t gfp); ++void au_nhash_wh_free(struct au_nhash *whlist); ++int au_nhash_test_longer_wh(struct au_nhash *whlist, aufs_bindex_t btgt, ++ int limit); ++int au_nhash_test_known_wh(struct au_nhash *whlist, char *name, int nlen); ++int au_nhash_append_wh(struct au_nhash *whlist, char *name, int nlen, ino_t ino, ++ unsigned int d_type, aufs_bindex_t bindex, ++ unsigned char shwh); ++void au_vdir_free(struct au_vdir *vdir); ++int au_vdir_init(struct file *file); ++int au_vdir_fill_de(struct file *file, struct dir_context *ctx); ++ ++/* ioctl.c */ ++long aufs_ioctl_dir(struct file *file, unsigned int cmd, unsigned long arg); ++ ++#ifdef CONFIG_AUFS_RDU ++/* rdu.c */ ++long au_rdu_ioctl(struct file *file, unsigned int cmd, unsigned long arg); ++#ifdef CONFIG_COMPAT ++long au_rdu_compat_ioctl(struct file *file, unsigned int cmd, ++ unsigned long arg); ++#endif ++#else ++AuStub(long, au_rdu_ioctl, return -EINVAL, struct file *file, ++ unsigned int cmd, unsigned long arg) ++#ifdef CONFIG_COMPAT ++AuStub(long, au_rdu_compat_ioctl, return -EINVAL, struct file *file, ++ unsigned int cmd, unsigned long arg) ++#endif ++#endif ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_DIR_H__ */ +diff --git a/fs/aufs/dynop.c b/fs/aufs/dynop.c +new file mode 100644 +index 0000000..e0c77bc +--- /dev/null ++++ b/fs/aufs/dynop.c +@@ -0,0 +1,369 @@ ++/* ++ * Copyright (C) 2010-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * dynamically customizable operations for regular files ++ */ ++ ++#include "aufs.h" ++ ++#define DyPrSym(key) AuDbgSym(key->dk_op.dy_hop) ++ ++/* ++ * How large will these lists be? ++ * Usually just a few elements, 20-30 at most for each, I guess. ++ */ ++static struct au_splhead dynop[AuDyLast]; ++ ++static struct au_dykey *dy_gfind_get(struct au_splhead *spl, const void *h_op) ++{ ++ struct au_dykey *key, *tmp; ++ struct list_head *head; ++ ++ key = NULL; ++ head = &spl->head; ++ rcu_read_lock(); ++ list_for_each_entry_rcu(tmp, head, dk_list) ++ if (tmp->dk_op.dy_hop == h_op) { ++ key = tmp; ++ kref_get(&key->dk_kref); ++ break; ++ } ++ rcu_read_unlock(); ++ ++ return key; ++} ++ ++static struct au_dykey *dy_bradd(struct au_branch *br, struct au_dykey *key) ++{ ++ struct au_dykey **k, *found; ++ const void *h_op = key->dk_op.dy_hop; ++ int i; ++ ++ found = NULL; ++ k = br->br_dykey; ++ for (i = 0; i < AuBrDynOp; i++) ++ if (k[i]) { ++ if (k[i]->dk_op.dy_hop == h_op) { ++ found = k[i]; ++ break; ++ } ++ } else ++ break; ++ if (!found) { ++ spin_lock(&br->br_dykey_lock); ++ for (; i < AuBrDynOp; i++) ++ if (k[i]) { ++ if (k[i]->dk_op.dy_hop == h_op) { ++ found = k[i]; ++ break; ++ } ++ } else { ++ k[i] = key; ++ break; ++ } ++ spin_unlock(&br->br_dykey_lock); ++ BUG_ON(i == AuBrDynOp); /* expand the array */ ++ } ++ ++ return found; ++} ++ ++/* kref_get() if @key is already added */ ++static struct au_dykey *dy_gadd(struct au_splhead *spl, struct au_dykey *key) ++{ ++ struct au_dykey *tmp, *found; ++ struct list_head *head; ++ const void *h_op = key->dk_op.dy_hop; ++ ++ found = NULL; ++ head = &spl->head; ++ spin_lock(&spl->spin); ++ list_for_each_entry(tmp, head, dk_list) ++ if (tmp->dk_op.dy_hop == h_op) { ++ kref_get(&tmp->dk_kref); ++ found = tmp; ++ break; ++ } ++ if (!found) ++ list_add_rcu(&key->dk_list, head); ++ spin_unlock(&spl->spin); ++ ++ if (!found) ++ DyPrSym(key); ++ return found; ++} ++ ++static void dy_free_rcu(struct rcu_head *rcu) ++{ ++ struct au_dykey *key; ++ ++ key = container_of(rcu, struct au_dykey, dk_rcu); ++ DyPrSym(key); ++ kfree(key); ++} ++ ++static void dy_free(struct kref *kref) ++{ ++ struct au_dykey *key; ++ struct au_splhead *spl; ++ ++ key = container_of(kref, struct au_dykey, dk_kref); ++ spl = dynop + key->dk_op.dy_type; ++ au_spl_del_rcu(&key->dk_list, spl); ++ call_rcu(&key->dk_rcu, dy_free_rcu); ++} ++ ++void au_dy_put(struct au_dykey *key) ++{ ++ kref_put(&key->dk_kref, dy_free); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++#define DyDbgSize(cnt, op) AuDebugOn(cnt != sizeof(op)/sizeof(void *)) ++ ++#ifdef CONFIG_AUFS_DEBUG ++#define DyDbgDeclare(cnt) unsigned int cnt = 0 ++#define DyDbgInc(cnt) do { cnt++; } while (0) ++#else ++#define DyDbgDeclare(cnt) do {} while (0) ++#define DyDbgInc(cnt) do {} while (0) ++#endif ++ ++#define DySet(func, dst, src, h_op, h_sb) do { \ ++ DyDbgInc(cnt); \ ++ if (h_op->func) { \ ++ if (src.func) \ ++ dst.func = src.func; \ ++ else \ ++ AuDbg("%s %s\n", au_sbtype(h_sb), #func); \ ++ } \ ++} while (0) ++ ++#define DySetForce(func, dst, src) do { \ ++ AuDebugOn(!src.func); \ ++ DyDbgInc(cnt); \ ++ dst.func = src.func; \ ++} while (0) ++ ++#define DySetAop(func) \ ++ DySet(func, dyaop->da_op, aufs_aop, h_aop, h_sb) ++#define DySetAopForce(func) \ ++ DySetForce(func, dyaop->da_op, aufs_aop) ++ ++static void dy_aop(struct au_dykey *key, const void *h_op, ++ struct super_block *h_sb __maybe_unused) ++{ ++ struct au_dyaop *dyaop = (void *)key; ++ const struct address_space_operations *h_aop = h_op; ++ DyDbgDeclare(cnt); ++ ++ AuDbg("%s\n", au_sbtype(h_sb)); ++ ++ DySetAop(writepage); ++ DySetAopForce(readpage); /* force */ ++ DySetAop(writepages); ++ DySetAop(set_page_dirty); ++ DySetAop(readpages); ++ DySetAop(write_begin); ++ DySetAop(write_end); ++ DySetAop(bmap); ++ DySetAop(invalidatepage); ++ DySetAop(releasepage); ++ DySetAop(freepage); ++ /* this one will be changed according to an aufs mount option */ ++ DySetAop(direct_IO); ++ DySetAop(migratepage); ++ DySetAop(launder_page); ++ DySetAop(is_partially_uptodate); ++ DySetAop(is_dirty_writeback); ++ DySetAop(error_remove_page); ++ DySetAop(swap_activate); ++ DySetAop(swap_deactivate); ++ ++ DyDbgSize(cnt, *h_aop); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static void dy_bug(struct kref *kref) ++{ ++ BUG(); ++} ++ ++static struct au_dykey *dy_get(struct au_dynop *op, struct au_branch *br) ++{ ++ struct au_dykey *key, *old; ++ struct au_splhead *spl; ++ struct op { ++ unsigned int sz; ++ void (*set)(struct au_dykey *key, const void *h_op, ++ struct super_block *h_sb __maybe_unused); ++ }; ++ static const struct op a[] = { ++ [AuDy_AOP] = { ++ .sz = sizeof(struct au_dyaop), ++ .set = dy_aop ++ } ++ }; ++ const struct op *p; ++ ++ spl = dynop + op->dy_type; ++ key = dy_gfind_get(spl, op->dy_hop); ++ if (key) ++ goto out_add; /* success */ ++ ++ p = a + op->dy_type; ++ key = kzalloc(p->sz, GFP_NOFS); ++ if (unlikely(!key)) { ++ key = ERR_PTR(-ENOMEM); ++ goto out; ++ } ++ ++ key->dk_op.dy_hop = op->dy_hop; ++ kref_init(&key->dk_kref); ++ p->set(key, op->dy_hop, au_br_sb(br)); ++ old = dy_gadd(spl, key); ++ if (old) { ++ kfree(key); ++ key = old; ++ } ++ ++out_add: ++ old = dy_bradd(br, key); ++ if (old) ++ /* its ref-count should never be zero here */ ++ kref_put(&key->dk_kref, dy_bug); ++out: ++ return key; ++} ++ ++/* ---------------------------------------------------------------------- */ ++/* ++ * Aufs prohibits O_DIRECT by defaut even if the branch supports it. ++ * This behaviour is necessary to return an error from open(O_DIRECT) instead ++ * of the succeeding I/O. The dio mount option enables O_DIRECT and makes ++ * open(O_DIRECT) always succeed, but the succeeding I/O may return an error. ++ * See the aufs manual in detail. ++ */ ++static void dy_adx(struct au_dyaop *dyaop, int do_dx) ++{ ++ if (!do_dx) ++ dyaop->da_op.direct_IO = NULL; ++ else ++ dyaop->da_op.direct_IO = aufs_aop.direct_IO; ++} ++ ++static struct au_dyaop *dy_aget(struct au_branch *br, ++ const struct address_space_operations *h_aop, ++ int do_dx) ++{ ++ struct au_dyaop *dyaop; ++ struct au_dynop op; ++ ++ op.dy_type = AuDy_AOP; ++ op.dy_haop = h_aop; ++ dyaop = (void *)dy_get(&op, br); ++ if (IS_ERR(dyaop)) ++ goto out; ++ dy_adx(dyaop, do_dx); ++ ++out: ++ return dyaop; ++} ++ ++int au_dy_iaop(struct inode *inode, aufs_bindex_t bindex, ++ struct inode *h_inode) ++{ ++ int err, do_dx; ++ struct super_block *sb; ++ struct au_branch *br; ++ struct au_dyaop *dyaop; ++ ++ AuDebugOn(!S_ISREG(h_inode->i_mode)); ++ IiMustWriteLock(inode); ++ ++ sb = inode->i_sb; ++ br = au_sbr(sb, bindex); ++ do_dx = !!au_opt_test(au_mntflags(sb), DIO); ++ dyaop = dy_aget(br, h_inode->i_mapping->a_ops, do_dx); ++ err = PTR_ERR(dyaop); ++ if (IS_ERR(dyaop)) ++ /* unnecessary to call dy_fput() */ ++ goto out; ++ ++ err = 0; ++ inode->i_mapping->a_ops = &dyaop->da_op; ++ ++out: ++ return err; ++} ++ ++/* ++ * Is it safe to replace a_ops during the inode/file is in operation? ++ * Yes, I hope so. ++ */ ++int au_dy_irefresh(struct inode *inode) ++{ ++ int err; ++ aufs_bindex_t btop; ++ struct inode *h_inode; ++ ++ err = 0; ++ if (S_ISREG(inode->i_mode)) { ++ btop = au_ibtop(inode); ++ h_inode = au_h_iptr(inode, btop); ++ err = au_dy_iaop(inode, btop, h_inode); ++ } ++ return err; ++} ++ ++void au_dy_arefresh(int do_dx) ++{ ++ struct au_splhead *spl; ++ struct list_head *head; ++ struct au_dykey *key; ++ ++ spl = dynop + AuDy_AOP; ++ head = &spl->head; ++ spin_lock(&spl->spin); ++ list_for_each_entry(key, head, dk_list) ++ dy_adx((void *)key, do_dx); ++ spin_unlock(&spl->spin); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++void __init au_dy_init(void) ++{ ++ int i; ++ ++ /* make sure that 'struct au_dykey *' can be any type */ ++ BUILD_BUG_ON(offsetof(struct au_dyaop, da_key)); ++ ++ for (i = 0; i < AuDyLast; i++) ++ au_spl_init(dynop + i); ++} ++ ++void au_dy_fin(void) ++{ ++ int i; ++ ++ for (i = 0; i < AuDyLast; i++) ++ WARN_ON(!list_empty(&dynop[i].head)); ++} +diff --git a/fs/aufs/dynop.h b/fs/aufs/dynop.h +new file mode 100644 +index 0000000..92f61f4 +--- /dev/null ++++ b/fs/aufs/dynop.h +@@ -0,0 +1,74 @@ ++/* ++ * Copyright (C) 2010-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * dynamically customizable operations (for regular files only) ++ */ ++ ++#ifndef __AUFS_DYNOP_H__ ++#define __AUFS_DYNOP_H__ ++ ++#ifdef __KERNEL__ ++ ++#include ++#include ++ ++enum {AuDy_AOP, AuDyLast}; ++ ++struct au_dynop { ++ int dy_type; ++ union { ++ const void *dy_hop; ++ const struct address_space_operations *dy_haop; ++ }; ++}; ++ ++struct au_dykey { ++ union { ++ struct list_head dk_list; ++ struct rcu_head dk_rcu; ++ }; ++ struct au_dynop dk_op; ++ ++ /* ++ * during I am in the branch local array, kref is gotten. when the ++ * branch is removed, kref is put. ++ */ ++ struct kref dk_kref; ++}; ++ ++/* stop unioning since their sizes are very different from each other */ ++struct au_dyaop { ++ struct au_dykey da_key; ++ struct address_space_operations da_op; /* not const */ ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* dynop.c */ ++struct au_branch; ++void au_dy_put(struct au_dykey *key); ++int au_dy_iaop(struct inode *inode, aufs_bindex_t bindex, ++ struct inode *h_inode); ++int au_dy_irefresh(struct inode *inode); ++void au_dy_arefresh(int do_dio); ++ ++void __init au_dy_init(void); ++void au_dy_fin(void); ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_DYNOP_H__ */ +diff --git a/fs/aufs/export.c b/fs/aufs/export.c +new file mode 100644 +index 0000000..dcdfe1d +--- /dev/null ++++ b/fs/aufs/export.c +@@ -0,0 +1,837 @@ ++/* ++ * Copyright (C) 2005-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * export via nfs ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include "../fs/mount.h" ++#include "aufs.h" ++ ++union conv { ++#ifdef CONFIG_AUFS_INO_T_64 ++ __u32 a[2]; ++#else ++ __u32 a[1]; ++#endif ++ ino_t ino; ++}; ++ ++static ino_t decode_ino(__u32 *a) ++{ ++ union conv u; ++ ++ BUILD_BUG_ON(sizeof(u.ino) != sizeof(u.a)); ++ u.a[0] = a[0]; ++#ifdef CONFIG_AUFS_INO_T_64 ++ u.a[1] = a[1]; ++#endif ++ return u.ino; ++} ++ ++static void encode_ino(__u32 *a, ino_t ino) ++{ ++ union conv u; ++ ++ u.ino = ino; ++ a[0] = u.a[0]; ++#ifdef CONFIG_AUFS_INO_T_64 ++ a[1] = u.a[1]; ++#endif ++} ++ ++/* NFS file handle */ ++enum { ++ Fh_br_id, ++ Fh_sigen, ++#ifdef CONFIG_AUFS_INO_T_64 ++ /* support 64bit inode number */ ++ Fh_ino1, ++ Fh_ino2, ++ Fh_dir_ino1, ++ Fh_dir_ino2, ++#else ++ Fh_ino1, ++ Fh_dir_ino1, ++#endif ++ Fh_igen, ++ Fh_h_type, ++ Fh_tail, ++ ++ Fh_ino = Fh_ino1, ++ Fh_dir_ino = Fh_dir_ino1 ++}; ++ ++static int au_test_anon(struct dentry *dentry) ++{ ++ /* note: read d_flags without d_lock */ ++ return !!(dentry->d_flags & DCACHE_DISCONNECTED); ++} ++ ++int au_test_nfsd(void) ++{ ++ int ret; ++ struct task_struct *tsk = current; ++ char comm[sizeof(tsk->comm)]; ++ ++ ret = 0; ++ if (tsk->flags & PF_KTHREAD) { ++ get_task_comm(comm, tsk); ++ ret = !strcmp(comm, "nfsd"); ++ } ++ ++ return ret; ++} ++ ++/* ---------------------------------------------------------------------- */ ++/* inode generation external table */ ++ ++void au_xigen_inc(struct inode *inode) ++{ ++ loff_t pos; ++ ssize_t sz; ++ __u32 igen; ++ struct super_block *sb; ++ struct au_sbinfo *sbinfo; ++ ++ sb = inode->i_sb; ++ AuDebugOn(!au_opt_test(au_mntflags(sb), XINO)); ++ ++ sbinfo = au_sbi(sb); ++ pos = inode->i_ino; ++ pos *= sizeof(igen); ++ igen = inode->i_generation + 1; ++ sz = xino_fwrite(sbinfo->si_xwrite, sbinfo->si_xigen, &igen, ++ sizeof(igen), &pos); ++ if (sz == sizeof(igen)) ++ return; /* success */ ++ ++ if (unlikely(sz >= 0)) ++ AuIOErr("xigen error (%zd)\n", sz); ++} ++ ++int au_xigen_new(struct inode *inode) ++{ ++ int err; ++ loff_t pos; ++ ssize_t sz; ++ struct super_block *sb; ++ struct au_sbinfo *sbinfo; ++ struct file *file; ++ ++ err = 0; ++ /* todo: dirty, at mount time */ ++ if (inode->i_ino == AUFS_ROOT_INO) ++ goto out; ++ sb = inode->i_sb; ++ SiMustAnyLock(sb); ++ if (unlikely(!au_opt_test(au_mntflags(sb), XINO))) ++ goto out; ++ ++ err = -EFBIG; ++ pos = inode->i_ino; ++ if (unlikely(au_loff_max / sizeof(inode->i_generation) - 1 < pos)) { ++ AuIOErr1("too large i%lld\n", pos); ++ goto out; ++ } ++ pos *= sizeof(inode->i_generation); ++ ++ err = 0; ++ sbinfo = au_sbi(sb); ++ file = sbinfo->si_xigen; ++ BUG_ON(!file); ++ ++ if (vfsub_f_size_read(file) ++ < pos + sizeof(inode->i_generation)) { ++ inode->i_generation = atomic_inc_return(&sbinfo->si_xigen_next); ++ sz = xino_fwrite(sbinfo->si_xwrite, file, &inode->i_generation, ++ sizeof(inode->i_generation), &pos); ++ } else ++ sz = xino_fread(sbinfo->si_xread, file, &inode->i_generation, ++ sizeof(inode->i_generation), &pos); ++ if (sz == sizeof(inode->i_generation)) ++ goto out; /* success */ ++ ++ err = sz; ++ if (unlikely(sz >= 0)) { ++ err = -EIO; ++ AuIOErr("xigen error (%zd)\n", sz); ++ } ++ ++out: ++ return err; ++} ++ ++int au_xigen_set(struct super_block *sb, struct file *base) ++{ ++ int err; ++ struct au_sbinfo *sbinfo; ++ struct file *file; ++ ++ SiMustWriteLock(sb); ++ ++ sbinfo = au_sbi(sb); ++ file = au_xino_create2(base, sbinfo->si_xigen); ++ err = PTR_ERR(file); ++ if (IS_ERR(file)) ++ goto out; ++ err = 0; ++ if (sbinfo->si_xigen) ++ fput(sbinfo->si_xigen); ++ sbinfo->si_xigen = file; ++ ++out: ++ return err; ++} ++ ++void au_xigen_clr(struct super_block *sb) ++{ ++ struct au_sbinfo *sbinfo; ++ ++ SiMustWriteLock(sb); ++ ++ sbinfo = au_sbi(sb); ++ if (sbinfo->si_xigen) { ++ fput(sbinfo->si_xigen); ++ sbinfo->si_xigen = NULL; ++ } ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static struct dentry *decode_by_ino(struct super_block *sb, ino_t ino, ++ ino_t dir_ino) ++{ ++ struct dentry *dentry, *d; ++ struct inode *inode; ++ unsigned int sigen; ++ ++ dentry = NULL; ++ inode = ilookup(sb, ino); ++ if (!inode) ++ goto out; ++ ++ dentry = ERR_PTR(-ESTALE); ++ sigen = au_sigen(sb); ++ if (unlikely(is_bad_inode(inode) ++ || IS_DEADDIR(inode) ++ || sigen != au_iigen(inode, NULL))) ++ goto out_iput; ++ ++ dentry = NULL; ++ if (!dir_ino || S_ISDIR(inode->i_mode)) ++ dentry = d_find_alias(inode); ++ else { ++ spin_lock(&inode->i_lock); ++ hlist_for_each_entry(d, &inode->i_dentry, d_u.d_alias) { ++ spin_lock(&d->d_lock); ++ if (!au_test_anon(d) ++ && d_inode(d->d_parent)->i_ino == dir_ino) { ++ dentry = dget_dlock(d); ++ spin_unlock(&d->d_lock); ++ break; ++ } ++ spin_unlock(&d->d_lock); ++ } ++ spin_unlock(&inode->i_lock); ++ } ++ if (unlikely(dentry && au_digen_test(dentry, sigen))) { ++ /* need to refresh */ ++ dput(dentry); ++ dentry = NULL; ++ } ++ ++out_iput: ++ iput(inode); ++out: ++ AuTraceErrPtr(dentry); ++ return dentry; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* todo: dirty? */ ++/* if exportfs_decode_fh() passed vfsmount*, we could be happy */ ++ ++struct au_compare_mnt_args { ++ /* input */ ++ struct super_block *sb; ++ ++ /* output */ ++ struct vfsmount *mnt; ++}; ++ ++static int au_compare_mnt(struct vfsmount *mnt, void *arg) ++{ ++ struct au_compare_mnt_args *a = arg; ++ ++ if (mnt->mnt_sb != a->sb) ++ return 0; ++ a->mnt = mntget(mnt); ++ return 1; ++} ++ ++static struct vfsmount *au_mnt_get(struct super_block *sb) ++{ ++ int err; ++ struct path root; ++ struct au_compare_mnt_args args = { ++ .sb = sb ++ }; ++ ++ get_fs_root(current->fs, &root); ++ rcu_read_lock(); ++ err = iterate_mounts(au_compare_mnt, &args, root.mnt); ++ rcu_read_unlock(); ++ path_put(&root); ++ AuDebugOn(!err); ++ AuDebugOn(!args.mnt); ++ return args.mnt; ++} ++ ++struct au_nfsd_si_lock { ++ unsigned int sigen; ++ aufs_bindex_t bindex, br_id; ++ unsigned char force_lock; ++}; ++ ++static int si_nfsd_read_lock(struct super_block *sb, ++ struct au_nfsd_si_lock *nsi_lock) ++{ ++ int err; ++ aufs_bindex_t bindex; ++ ++ si_read_lock(sb, AuLock_FLUSH); ++ ++ /* branch id may be wrapped around */ ++ err = 0; ++ bindex = au_br_index(sb, nsi_lock->br_id); ++ if (bindex >= 0 && nsi_lock->sigen + AUFS_BRANCH_MAX > au_sigen(sb)) ++ goto out; /* success */ ++ ++ err = -ESTALE; ++ bindex = -1; ++ if (!nsi_lock->force_lock) ++ si_read_unlock(sb); ++ ++out: ++ nsi_lock->bindex = bindex; ++ return err; ++} ++ ++struct find_name_by_ino { ++ struct dir_context ctx; ++ int called, found; ++ ino_t ino; ++ char *name; ++ int namelen; ++}; ++ ++static int ++find_name_by_ino(struct dir_context *ctx, const char *name, int namelen, ++ loff_t offset, u64 ino, unsigned int d_type) ++{ ++ struct find_name_by_ino *a = container_of(ctx, struct find_name_by_ino, ++ ctx); ++ ++ a->called++; ++ if (a->ino != ino) ++ return 0; ++ ++ memcpy(a->name, name, namelen); ++ a->namelen = namelen; ++ a->found = 1; ++ return 1; ++} ++ ++static struct dentry *au_lkup_by_ino(struct path *path, ino_t ino, ++ struct au_nfsd_si_lock *nsi_lock) ++{ ++ struct dentry *dentry, *parent; ++ struct file *file; ++ struct inode *dir; ++ struct find_name_by_ino arg = { ++ .ctx = { ++ .actor = find_name_by_ino ++ } ++ }; ++ int err; ++ ++ parent = path->dentry; ++ if (nsi_lock) ++ si_read_unlock(parent->d_sb); ++ file = vfsub_dentry_open(path, au_dir_roflags); ++ dentry = (void *)file; ++ if (IS_ERR(file)) ++ goto out; ++ ++ dentry = ERR_PTR(-ENOMEM); ++ arg.name = (void *)__get_free_page(GFP_NOFS); ++ if (unlikely(!arg.name)) ++ goto out_file; ++ arg.ino = ino; ++ arg.found = 0; ++ do { ++ arg.called = 0; ++ /* smp_mb(); */ ++ err = vfsub_iterate_dir(file, &arg.ctx); ++ } while (!err && !arg.found && arg.called); ++ dentry = ERR_PTR(err); ++ if (unlikely(err)) ++ goto out_name; ++ /* instead of ENOENT */ ++ dentry = ERR_PTR(-ESTALE); ++ if (!arg.found) ++ goto out_name; ++ ++ /* do not call vfsub_lkup_one() */ ++ dir = d_inode(parent); ++ dentry = vfsub_lookup_one_len_unlocked(arg.name, parent, arg.namelen); ++ AuTraceErrPtr(dentry); ++ if (IS_ERR(dentry)) ++ goto out_name; ++ AuDebugOn(au_test_anon(dentry)); ++ if (unlikely(d_really_is_negative(dentry))) { ++ dput(dentry); ++ dentry = ERR_PTR(-ENOENT); ++ } ++ ++out_name: ++ free_page((unsigned long)arg.name); ++out_file: ++ fput(file); ++out: ++ if (unlikely(nsi_lock ++ && si_nfsd_read_lock(parent->d_sb, nsi_lock) < 0)) ++ if (!IS_ERR(dentry)) { ++ dput(dentry); ++ dentry = ERR_PTR(-ESTALE); ++ } ++ AuTraceErrPtr(dentry); ++ return dentry; ++} ++ ++static struct dentry *decode_by_dir_ino(struct super_block *sb, ino_t ino, ++ ino_t dir_ino, ++ struct au_nfsd_si_lock *nsi_lock) ++{ ++ struct dentry *dentry; ++ struct path path; ++ ++ if (dir_ino != AUFS_ROOT_INO) { ++ path.dentry = decode_by_ino(sb, dir_ino, 0); ++ dentry = path.dentry; ++ if (!path.dentry || IS_ERR(path.dentry)) ++ goto out; ++ AuDebugOn(au_test_anon(path.dentry)); ++ } else ++ path.dentry = dget(sb->s_root); ++ ++ path.mnt = au_mnt_get(sb); ++ dentry = au_lkup_by_ino(&path, ino, nsi_lock); ++ path_put(&path); ++ ++out: ++ AuTraceErrPtr(dentry); ++ return dentry; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int h_acceptable(void *expv, struct dentry *dentry) ++{ ++ return 1; ++} ++ ++static char *au_build_path(struct dentry *h_parent, struct path *h_rootpath, ++ char *buf, int len, struct super_block *sb) ++{ ++ char *p; ++ int n; ++ struct path path; ++ ++ p = d_path(h_rootpath, buf, len); ++ if (IS_ERR(p)) ++ goto out; ++ n = strlen(p); ++ ++ path.mnt = h_rootpath->mnt; ++ path.dentry = h_parent; ++ p = d_path(&path, buf, len); ++ if (IS_ERR(p)) ++ goto out; ++ if (n != 1) ++ p += n; ++ ++ path.mnt = au_mnt_get(sb); ++ path.dentry = sb->s_root; ++ p = d_path(&path, buf, len - strlen(p)); ++ mntput(path.mnt); ++ if (IS_ERR(p)) ++ goto out; ++ if (n != 1) ++ p[strlen(p)] = '/'; ++ ++out: ++ AuTraceErrPtr(p); ++ return p; ++} ++ ++static ++struct dentry *decode_by_path(struct super_block *sb, ino_t ino, __u32 *fh, ++ int fh_len, struct au_nfsd_si_lock *nsi_lock) ++{ ++ struct dentry *dentry, *h_parent, *root; ++ struct super_block *h_sb; ++ char *pathname, *p; ++ struct vfsmount *h_mnt; ++ struct au_branch *br; ++ int err; ++ struct path path; ++ ++ br = au_sbr(sb, nsi_lock->bindex); ++ h_mnt = au_br_mnt(br); ++ h_sb = h_mnt->mnt_sb; ++ /* todo: call lower fh_to_dentry()? fh_to_parent()? */ ++ lockdep_off(); ++ h_parent = exportfs_decode_fh(h_mnt, (void *)(fh + Fh_tail), ++ fh_len - Fh_tail, fh[Fh_h_type], ++ h_acceptable, /*context*/NULL); ++ lockdep_on(); ++ dentry = h_parent; ++ if (unlikely(!h_parent || IS_ERR(h_parent))) { ++ AuWarn1("%s decode_fh failed, %ld\n", ++ au_sbtype(h_sb), PTR_ERR(h_parent)); ++ goto out; ++ } ++ dentry = NULL; ++ if (unlikely(au_test_anon(h_parent))) { ++ AuWarn1("%s decode_fh returned a disconnected dentry\n", ++ au_sbtype(h_sb)); ++ goto out_h_parent; ++ } ++ ++ dentry = ERR_PTR(-ENOMEM); ++ pathname = (void *)__get_free_page(GFP_NOFS); ++ if (unlikely(!pathname)) ++ goto out_h_parent; ++ ++ root = sb->s_root; ++ path.mnt = h_mnt; ++ di_read_lock_parent(root, !AuLock_IR); ++ path.dentry = au_h_dptr(root, nsi_lock->bindex); ++ di_read_unlock(root, !AuLock_IR); ++ p = au_build_path(h_parent, &path, pathname, PAGE_SIZE, sb); ++ dentry = (void *)p; ++ if (IS_ERR(p)) ++ goto out_pathname; ++ ++ si_read_unlock(sb); ++ err = vfsub_kern_path(p, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &path); ++ dentry = ERR_PTR(err); ++ if (unlikely(err)) ++ goto out_relock; ++ ++ dentry = ERR_PTR(-ENOENT); ++ AuDebugOn(au_test_anon(path.dentry)); ++ if (unlikely(d_really_is_negative(path.dentry))) ++ goto out_path; ++ ++ if (ino != d_inode(path.dentry)->i_ino) ++ dentry = au_lkup_by_ino(&path, ino, /*nsi_lock*/NULL); ++ else ++ dentry = dget(path.dentry); ++ ++out_path: ++ path_put(&path); ++out_relock: ++ if (unlikely(si_nfsd_read_lock(sb, nsi_lock) < 0)) ++ if (!IS_ERR(dentry)) { ++ dput(dentry); ++ dentry = ERR_PTR(-ESTALE); ++ } ++out_pathname: ++ free_page((unsigned long)pathname); ++out_h_parent: ++ dput(h_parent); ++out: ++ AuTraceErrPtr(dentry); ++ return dentry; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static struct dentry * ++aufs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, ++ int fh_type) ++{ ++ struct dentry *dentry; ++ __u32 *fh = fid->raw; ++ struct au_branch *br; ++ ino_t ino, dir_ino; ++ struct au_nfsd_si_lock nsi_lock = { ++ .force_lock = 0 ++ }; ++ ++ dentry = ERR_PTR(-ESTALE); ++ /* it should never happen, but the file handle is unreliable */ ++ if (unlikely(fh_len < Fh_tail)) ++ goto out; ++ nsi_lock.sigen = fh[Fh_sigen]; ++ nsi_lock.br_id = fh[Fh_br_id]; ++ ++ /* branch id may be wrapped around */ ++ br = NULL; ++ if (unlikely(si_nfsd_read_lock(sb, &nsi_lock))) ++ goto out; ++ nsi_lock.force_lock = 1; ++ ++ /* is this inode still cached? */ ++ ino = decode_ino(fh + Fh_ino); ++ /* it should never happen */ ++ if (unlikely(ino == AUFS_ROOT_INO)) ++ goto out_unlock; ++ ++ dir_ino = decode_ino(fh + Fh_dir_ino); ++ dentry = decode_by_ino(sb, ino, dir_ino); ++ if (IS_ERR(dentry)) ++ goto out_unlock; ++ if (dentry) ++ goto accept; ++ ++ /* is the parent dir cached? */ ++ br = au_sbr(sb, nsi_lock.bindex); ++ au_br_get(br); ++ dentry = decode_by_dir_ino(sb, ino, dir_ino, &nsi_lock); ++ if (IS_ERR(dentry)) ++ goto out_unlock; ++ if (dentry) ++ goto accept; ++ ++ /* lookup path */ ++ dentry = decode_by_path(sb, ino, fh, fh_len, &nsi_lock); ++ if (IS_ERR(dentry)) ++ goto out_unlock; ++ if (unlikely(!dentry)) ++ /* todo?: make it ESTALE */ ++ goto out_unlock; ++ ++accept: ++ if (!au_digen_test(dentry, au_sigen(sb)) ++ && d_inode(dentry)->i_generation == fh[Fh_igen]) ++ goto out_unlock; /* success */ ++ ++ dput(dentry); ++ dentry = ERR_PTR(-ESTALE); ++out_unlock: ++ if (br) ++ au_br_put(br); ++ si_read_unlock(sb); ++out: ++ AuTraceErrPtr(dentry); ++ return dentry; ++} ++ ++#if 0 /* reserved for future use */ ++/* support subtreecheck option */ ++static struct dentry *aufs_fh_to_parent(struct super_block *sb, struct fid *fid, ++ int fh_len, int fh_type) ++{ ++ struct dentry *parent; ++ __u32 *fh = fid->raw; ++ ino_t dir_ino; ++ ++ dir_ino = decode_ino(fh + Fh_dir_ino); ++ parent = decode_by_ino(sb, dir_ino, 0); ++ if (IS_ERR(parent)) ++ goto out; ++ if (!parent) ++ parent = decode_by_path(sb, au_br_index(sb, fh[Fh_br_id]), ++ dir_ino, fh, fh_len); ++ ++out: ++ AuTraceErrPtr(parent); ++ return parent; ++} ++#endif ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int aufs_encode_fh(struct inode *inode, __u32 *fh, int *max_len, ++ struct inode *dir) ++{ ++ int err; ++ aufs_bindex_t bindex; ++ struct super_block *sb, *h_sb; ++ struct dentry *dentry, *parent, *h_parent; ++ struct inode *h_dir; ++ struct au_branch *br; ++ ++ err = -ENOSPC; ++ if (unlikely(*max_len <= Fh_tail)) { ++ AuWarn1("NFSv2 client (max_len %d)?\n", *max_len); ++ goto out; ++ } ++ ++ err = FILEID_ROOT; ++ if (inode->i_ino == AUFS_ROOT_INO) { ++ AuDebugOn(inode->i_ino != AUFS_ROOT_INO); ++ goto out; ++ } ++ ++ h_parent = NULL; ++ sb = inode->i_sb; ++ err = si_read_lock(sb, AuLock_FLUSH); ++ if (unlikely(err)) ++ goto out; ++ ++#ifdef CONFIG_AUFS_DEBUG ++ if (unlikely(!au_opt_test(au_mntflags(sb), XINO))) ++ AuWarn1("NFS-exporting requires xino\n"); ++#endif ++ err = -EIO; ++ parent = NULL; ++ ii_read_lock_child(inode); ++ bindex = au_ibtop(inode); ++ if (!dir) { ++ dentry = d_find_any_alias(inode); ++ if (unlikely(!dentry)) ++ goto out_unlock; ++ AuDebugOn(au_test_anon(dentry)); ++ parent = dget_parent(dentry); ++ dput(dentry); ++ if (unlikely(!parent)) ++ goto out_unlock; ++ if (d_really_is_positive(parent)) ++ dir = d_inode(parent); ++ } ++ ++ ii_read_lock_parent(dir); ++ h_dir = au_h_iptr(dir, bindex); ++ ii_read_unlock(dir); ++ if (unlikely(!h_dir)) ++ goto out_parent; ++ h_parent = d_find_any_alias(h_dir); ++ if (unlikely(!h_parent)) ++ goto out_hparent; ++ ++ err = -EPERM; ++ br = au_sbr(sb, bindex); ++ h_sb = au_br_sb(br); ++ if (unlikely(!h_sb->s_export_op)) { ++ AuErr1("%s branch is not exportable\n", au_sbtype(h_sb)); ++ goto out_hparent; ++ } ++ ++ fh[Fh_br_id] = br->br_id; ++ fh[Fh_sigen] = au_sigen(sb); ++ encode_ino(fh + Fh_ino, inode->i_ino); ++ encode_ino(fh + Fh_dir_ino, dir->i_ino); ++ fh[Fh_igen] = inode->i_generation; ++ ++ *max_len -= Fh_tail; ++ fh[Fh_h_type] = exportfs_encode_fh(h_parent, (void *)(fh + Fh_tail), ++ max_len, ++ /*connectable or subtreecheck*/0); ++ err = fh[Fh_h_type]; ++ *max_len += Fh_tail; ++ /* todo: macros? */ ++ if (err != FILEID_INVALID) ++ err = 99; ++ else ++ AuWarn1("%s encode_fh failed\n", au_sbtype(h_sb)); ++ ++out_hparent: ++ dput(h_parent); ++out_parent: ++ dput(parent); ++out_unlock: ++ ii_read_unlock(inode); ++ si_read_unlock(sb); ++out: ++ if (unlikely(err < 0)) ++ err = FILEID_INVALID; ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int aufs_commit_metadata(struct inode *inode) ++{ ++ int err; ++ aufs_bindex_t bindex; ++ struct super_block *sb; ++ struct inode *h_inode; ++ int (*f)(struct inode *inode); ++ ++ sb = inode->i_sb; ++ si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW); ++ ii_write_lock_child(inode); ++ bindex = au_ibtop(inode); ++ AuDebugOn(bindex < 0); ++ h_inode = au_h_iptr(inode, bindex); ++ ++ f = h_inode->i_sb->s_export_op->commit_metadata; ++ if (f) ++ err = f(h_inode); ++ else { ++ struct writeback_control wbc = { ++ .sync_mode = WB_SYNC_ALL, ++ .nr_to_write = 0 /* metadata only */ ++ }; ++ ++ err = sync_inode(h_inode, &wbc); ++ } ++ ++ au_cpup_attr_timesizes(inode); ++ ii_write_unlock(inode); ++ si_read_unlock(sb); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static struct export_operations aufs_export_op = { ++ .fh_to_dentry = aufs_fh_to_dentry, ++ /* .fh_to_parent = aufs_fh_to_parent, */ ++ .encode_fh = aufs_encode_fh, ++ .commit_metadata = aufs_commit_metadata ++}; ++ ++void au_export_init(struct super_block *sb) ++{ ++ struct au_sbinfo *sbinfo; ++ __u32 u; ++ ++ BUILD_BUG_ON_MSG(IS_BUILTIN(CONFIG_AUFS_FS) ++ && IS_MODULE(CONFIG_EXPORTFS), ++ AUFS_NAME ": unsupported configuration " ++ "CONFIG_EXPORTFS=m and CONFIG_AUFS_FS=y"); ++ ++ sb->s_export_op = &aufs_export_op; ++ sbinfo = au_sbi(sb); ++ sbinfo->si_xigen = NULL; ++ get_random_bytes(&u, sizeof(u)); ++ BUILD_BUG_ON(sizeof(u) != sizeof(int)); ++ atomic_set(&sbinfo->si_xigen_next, u); ++} +diff --git a/fs/aufs/f_op.c b/fs/aufs/f_op.c +new file mode 100644 +index 0000000..0b54eef +--- /dev/null ++++ b/fs/aufs/f_op.c +@@ -0,0 +1,770 @@ ++/* ++ * Copyright (C) 2005-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * file and vm operations ++ */ ++ ++#include ++#include ++#include ++#include ++#include "aufs.h" ++ ++int au_do_open_nondir(struct file *file, int flags, struct file *h_file) ++{ ++ int err; ++ aufs_bindex_t bindex; ++ struct dentry *dentry, *h_dentry; ++ struct au_finfo *finfo; ++ struct inode *h_inode; ++ ++ FiMustWriteLock(file); ++ ++ err = 0; ++ dentry = file->f_path.dentry; ++ AuDebugOn(IS_ERR_OR_NULL(dentry)); ++ finfo = au_fi(file); ++ memset(&finfo->fi_htop, 0, sizeof(finfo->fi_htop)); ++ atomic_set(&finfo->fi_mmapped, 0); ++ bindex = au_dbtop(dentry); ++ if (!h_file) { ++ h_dentry = au_h_dptr(dentry, bindex); ++ err = vfsub_test_mntns(file->f_path.mnt, h_dentry->d_sb); ++ if (unlikely(err)) ++ goto out; ++ h_file = au_h_open(dentry, bindex, flags, file, /*force_wr*/0); ++ } else { ++ h_dentry = h_file->f_path.dentry; ++ err = vfsub_test_mntns(file->f_path.mnt, h_dentry->d_sb); ++ if (unlikely(err)) ++ goto out; ++ get_file(h_file); ++ } ++ if (IS_ERR(h_file)) ++ err = PTR_ERR(h_file); ++ else { ++ if ((flags & __O_TMPFILE) ++ && !(flags & O_EXCL)) { ++ h_inode = file_inode(h_file); ++ spin_lock(&h_inode->i_lock); ++ h_inode->i_state |= I_LINKABLE; ++ spin_unlock(&h_inode->i_lock); ++ } ++ au_set_fbtop(file, bindex); ++ au_set_h_fptr(file, bindex, h_file); ++ au_update_figen(file); ++ /* todo: necessary? */ ++ /* file->f_ra = h_file->f_ra; */ ++ } ++ ++out: ++ return err; ++} ++ ++static int aufs_open_nondir(struct inode *inode __maybe_unused, ++ struct file *file) ++{ ++ int err; ++ struct super_block *sb; ++ struct au_do_open_args args = { ++ .open = au_do_open_nondir ++ }; ++ ++ AuDbg("%pD, f_flags 0x%x, f_mode 0x%x\n", ++ file, vfsub_file_flags(file), file->f_mode); ++ ++ sb = file->f_path.dentry->d_sb; ++ si_read_lock(sb, AuLock_FLUSH); ++ err = au_do_open(file, &args); ++ si_read_unlock(sb); ++ return err; ++} ++ ++int aufs_release_nondir(struct inode *inode __maybe_unused, struct file *file) ++{ ++ struct au_finfo *finfo; ++ aufs_bindex_t bindex; ++ ++ finfo = au_fi(file); ++ au_sphl_del(&finfo->fi_hlist, ++ &au_sbi(file->f_path.dentry->d_sb)->si_files); ++ bindex = finfo->fi_btop; ++ if (bindex >= 0) ++ au_set_h_fptr(file, bindex, NULL); ++ ++ au_finfo_fin(file); ++ return 0; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int au_do_flush_nondir(struct file *file, fl_owner_t id) ++{ ++ int err; ++ struct file *h_file; ++ ++ err = 0; ++ h_file = au_hf_top(file); ++ if (h_file) ++ err = vfsub_flush(h_file, id); ++ return err; ++} ++ ++static int aufs_flush_nondir(struct file *file, fl_owner_t id) ++{ ++ return au_do_flush(file, id, au_do_flush_nondir); ++} ++ ++/* ---------------------------------------------------------------------- */ ++/* ++ * read and write functions acquire [fdi]_rwsem once, but release before ++ * mmap_sem. This is because to stop a race condition between mmap(2). ++ * Releasing these aufs-rwsem should be safe, no branch-mamagement (by keeping ++ * si_rwsem), no harmful copy-up should happen. Actually copy-up may happen in ++ * read functions after [fdi]_rwsem are released, but it should be harmless. ++ */ ++ ++/* Callers should call au_read_post() or fput() in the end */ ++struct file *au_read_pre(struct file *file, int keep_fi) ++{ ++ struct file *h_file; ++ int err; ++ ++ err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/0); ++ if (!err) { ++ di_read_unlock(file->f_path.dentry, AuLock_IR); ++ h_file = au_hf_top(file); ++ get_file(h_file); ++ if (!keep_fi) ++ fi_read_unlock(file); ++ } else ++ h_file = ERR_PTR(err); ++ ++ return h_file; ++} ++ ++static void au_read_post(struct inode *inode, struct file *h_file) ++{ ++ /* update without lock, I don't think it a problem */ ++ fsstack_copy_attr_atime(inode, file_inode(h_file)); ++ fput(h_file); ++} ++ ++struct au_write_pre { ++ blkcnt_t blks; ++ aufs_bindex_t btop; ++}; ++ ++/* ++ * return with iinfo is write-locked ++ * callers should call au_write_post() or iinfo_write_unlock() + fput() in the ++ * end ++ */ ++static struct file *au_write_pre(struct file *file, int do_ready, ++ struct au_write_pre *wpre) ++{ ++ struct file *h_file; ++ struct dentry *dentry; ++ int err; ++ struct au_pin pin; ++ ++ err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/1); ++ h_file = ERR_PTR(err); ++ if (unlikely(err)) ++ goto out; ++ ++ dentry = file->f_path.dentry; ++ if (do_ready) { ++ err = au_ready_to_write(file, -1, &pin); ++ if (unlikely(err)) { ++ h_file = ERR_PTR(err); ++ di_write_unlock(dentry); ++ goto out_fi; ++ } ++ } ++ ++ di_downgrade_lock(dentry, /*flags*/0); ++ if (wpre) ++ wpre->btop = au_fbtop(file); ++ h_file = au_hf_top(file); ++ get_file(h_file); ++ if (wpre) ++ wpre->blks = file_inode(h_file)->i_blocks; ++ if (do_ready) ++ au_unpin(&pin); ++ di_read_unlock(dentry, /*flags*/0); ++ ++out_fi: ++ fi_write_unlock(file); ++out: ++ return h_file; ++} ++ ++static void au_write_post(struct inode *inode, struct file *h_file, ++ struct au_write_pre *wpre, ssize_t written) ++{ ++ struct inode *h_inode; ++ ++ au_cpup_attr_timesizes(inode); ++ AuDebugOn(au_ibtop(inode) != wpre->btop); ++ h_inode = file_inode(h_file); ++ inode->i_mode = h_inode->i_mode; ++ ii_write_unlock(inode); ++ fput(h_file); ++ ++ /* AuDbg("blks %llu, %llu\n", (u64)blks, (u64)h_inode->i_blocks); */ ++ if (written > 0) ++ au_fhsm_wrote(inode->i_sb, wpre->btop, ++ /*force*/h_inode->i_blocks > wpre->blks); ++} ++ ++static ssize_t aufs_read(struct file *file, char __user *buf, size_t count, ++ loff_t *ppos) ++{ ++ ssize_t err; ++ struct inode *inode; ++ struct file *h_file; ++ struct super_block *sb; ++ ++ inode = file_inode(file); ++ sb = inode->i_sb; ++ si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW); ++ ++ h_file = au_read_pre(file, /*keep_fi*/0); ++ err = PTR_ERR(h_file); ++ if (IS_ERR(h_file)) ++ goto out; ++ ++ /* filedata may be obsoleted by concurrent copyup, but no problem */ ++ err = vfsub_read_u(h_file, buf, count, ppos); ++ /* todo: necessary? */ ++ /* file->f_ra = h_file->f_ra; */ ++ au_read_post(inode, h_file); ++ ++out: ++ si_read_unlock(sb); ++ return err; ++} ++ ++/* ++ * todo: very ugly ++ * it locks both of i_mutex and si_rwsem for read in safe. ++ * if the plink maintenance mode continues forever (that is the problem), ++ * may loop forever. ++ */ ++static void au_mtx_and_read_lock(struct inode *inode) ++{ ++ int err; ++ struct super_block *sb = inode->i_sb; ++ ++ while (1) { ++ inode_lock(inode); ++ err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM); ++ if (!err) ++ break; ++ inode_unlock(inode); ++ si_read_lock(sb, AuLock_NOPLMW); ++ si_read_unlock(sb); ++ } ++} ++ ++static ssize_t aufs_write(struct file *file, const char __user *ubuf, ++ size_t count, loff_t *ppos) ++{ ++ ssize_t err; ++ struct au_write_pre wpre; ++ struct inode *inode; ++ struct file *h_file; ++ char __user *buf = (char __user *)ubuf; ++ ++ inode = file_inode(file); ++ au_mtx_and_read_lock(inode); ++ ++ h_file = au_write_pre(file, /*do_ready*/1, &wpre); ++ err = PTR_ERR(h_file); ++ if (IS_ERR(h_file)) ++ goto out; ++ ++ err = vfsub_write_u(h_file, buf, count, ppos); ++ au_write_post(inode, h_file, &wpre, err); ++ ++out: ++ si_read_unlock(inode->i_sb); ++ inode_unlock(inode); ++ return err; ++} ++ ++static ssize_t au_do_iter(struct file *h_file, int rw, struct kiocb *kio, ++ struct iov_iter *iov_iter) ++{ ++ ssize_t err; ++ struct file *file; ++ ssize_t (*iter)(struct kiocb *, struct iov_iter *); ++ ++ err = security_file_permission(h_file, rw); ++ if (unlikely(err)) ++ goto out; ++ ++ err = -ENOSYS; ++ iter = NULL; ++ if (rw == MAY_READ) ++ iter = h_file->f_op->read_iter; ++ else if (rw == MAY_WRITE) ++ iter = h_file->f_op->write_iter; ++ ++ file = kio->ki_filp; ++ kio->ki_filp = h_file; ++ if (iter) { ++ lockdep_off(); ++ err = iter(kio, iov_iter); ++ lockdep_on(); ++ } else ++ /* currently there is no such fs */ ++ WARN_ON_ONCE(1); ++ kio->ki_filp = file; ++ ++out: ++ return err; ++} ++ ++static ssize_t aufs_read_iter(struct kiocb *kio, struct iov_iter *iov_iter) ++{ ++ ssize_t err; ++ struct file *file, *h_file; ++ struct inode *inode; ++ struct super_block *sb; ++ ++ file = kio->ki_filp; ++ inode = file_inode(file); ++ sb = inode->i_sb; ++ si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW); ++ ++ h_file = au_read_pre(file, /*keep_fi*/0); ++ err = PTR_ERR(h_file); ++ if (IS_ERR(h_file)) ++ goto out; ++ ++ err = au_do_iter(h_file, MAY_READ, kio, iov_iter); ++ /* todo: necessary? */ ++ /* file->f_ra = h_file->f_ra; */ ++ au_read_post(inode, h_file); ++ ++out: ++ si_read_unlock(sb); ++ return err; ++} ++ ++static ssize_t aufs_write_iter(struct kiocb *kio, struct iov_iter *iov_iter) ++{ ++ ssize_t err; ++ struct au_write_pre wpre; ++ struct inode *inode; ++ struct file *file, *h_file; ++ ++ file = kio->ki_filp; ++ inode = file_inode(file); ++ au_mtx_and_read_lock(inode); ++ ++ h_file = au_write_pre(file, /*do_ready*/1, &wpre); ++ err = PTR_ERR(h_file); ++ if (IS_ERR(h_file)) ++ goto out; ++ ++ err = au_do_iter(h_file, MAY_WRITE, kio, iov_iter); ++ au_write_post(inode, h_file, &wpre, err); ++ ++out: ++ si_read_unlock(inode->i_sb); ++ inode_unlock(inode); ++ return err; ++} ++ ++static ssize_t aufs_splice_read(struct file *file, loff_t *ppos, ++ struct pipe_inode_info *pipe, size_t len, ++ unsigned int flags) ++{ ++ ssize_t err; ++ struct file *h_file; ++ struct inode *inode; ++ struct super_block *sb; ++ ++ inode = file_inode(file); ++ sb = inode->i_sb; ++ si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW); ++ ++ h_file = au_read_pre(file, /*keep_fi*/1); ++ err = PTR_ERR(h_file); ++ if (IS_ERR(h_file)) ++ goto out; ++ ++ if (au_test_loopback_kthread()) { ++ au_warn_loopback(h_file->f_path.dentry->d_sb); ++ if (file->f_mapping != h_file->f_mapping) { ++ file->f_mapping = h_file->f_mapping; ++ smp_mb(); /* unnecessary? */ ++ } ++ } ++ fi_read_unlock(file); ++ ++ err = vfsub_splice_to(h_file, ppos, pipe, len, flags); ++ /* todo: necessasry? */ ++ /* file->f_ra = h_file->f_ra; */ ++ au_read_post(inode, h_file); ++ ++out: ++ si_read_unlock(sb); ++ return err; ++} ++ ++static ssize_t ++aufs_splice_write(struct pipe_inode_info *pipe, struct file *file, loff_t *ppos, ++ size_t len, unsigned int flags) ++{ ++ ssize_t err; ++ struct au_write_pre wpre; ++ struct inode *inode; ++ struct file *h_file; ++ ++ inode = file_inode(file); ++ au_mtx_and_read_lock(inode); ++ ++ h_file = au_write_pre(file, /*do_ready*/1, &wpre); ++ err = PTR_ERR(h_file); ++ if (IS_ERR(h_file)) ++ goto out; ++ ++ err = vfsub_splice_from(pipe, h_file, ppos, len, flags); ++ au_write_post(inode, h_file, &wpre, err); ++ ++out: ++ si_read_unlock(inode->i_sb); ++ inode_unlock(inode); ++ return err; ++} ++ ++static long aufs_fallocate(struct file *file, int mode, loff_t offset, ++ loff_t len) ++{ ++ long err; ++ struct au_write_pre wpre; ++ struct inode *inode; ++ struct file *h_file; ++ ++ inode = file_inode(file); ++ au_mtx_and_read_lock(inode); ++ ++ h_file = au_write_pre(file, /*do_ready*/1, &wpre); ++ err = PTR_ERR(h_file); ++ if (IS_ERR(h_file)) ++ goto out; ++ ++ lockdep_off(); ++ err = vfs_fallocate(h_file, mode, offset, len); ++ lockdep_on(); ++ au_write_post(inode, h_file, &wpre, /*written*/1); ++ ++out: ++ si_read_unlock(inode->i_sb); ++ inode_unlock(inode); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * The locking order around current->mmap_sem. ++ * - in most and regular cases ++ * file I/O syscall -- aufs_read() or something ++ * -- si_rwsem for read -- mmap_sem ++ * (Note that [fdi]i_rwsem are released before mmap_sem). ++ * - in mmap case ++ * mmap(2) -- mmap_sem -- aufs_mmap() -- si_rwsem for read -- [fdi]i_rwsem ++ * This AB-BA order is definitly bad, but is not a problem since "si_rwsem for ++ * read" allows muliple processes to acquire it and [fdi]i_rwsem are not held in ++ * file I/O. Aufs needs to stop lockdep in aufs_mmap() though. ++ * It means that when aufs acquires si_rwsem for write, the process should never ++ * acquire mmap_sem. ++ * ++ * Actually aufs_iterate() holds [fdi]i_rwsem before mmap_sem, but this is not a ++ * problem either since any directory is not able to be mmap-ed. ++ * The similar scenario is applied to aufs_readlink() too. ++ */ ++ ++#if 0 /* stop calling security_file_mmap() */ ++/* cf. linux/include/linux/mman.h: calc_vm_prot_bits() */ ++#define AuConv_VM_PROT(f, b) _calc_vm_trans(f, VM_##b, PROT_##b) ++ ++static unsigned long au_arch_prot_conv(unsigned long flags) ++{ ++ /* currently ppc64 only */ ++#ifdef CONFIG_PPC64 ++ /* cf. linux/arch/powerpc/include/asm/mman.h */ ++ AuDebugOn(arch_calc_vm_prot_bits(-1) != VM_SAO); ++ return AuConv_VM_PROT(flags, SAO); ++#else ++ AuDebugOn(arch_calc_vm_prot_bits(-1)); ++ return 0; ++#endif ++} ++ ++static unsigned long au_prot_conv(unsigned long flags) ++{ ++ return AuConv_VM_PROT(flags, READ) ++ | AuConv_VM_PROT(flags, WRITE) ++ | AuConv_VM_PROT(flags, EXEC) ++ | au_arch_prot_conv(flags); ++} ++ ++/* cf. linux/include/linux/mman.h: calc_vm_flag_bits() */ ++#define AuConv_VM_MAP(f, b) _calc_vm_trans(f, VM_##b, MAP_##b) ++ ++static unsigned long au_flag_conv(unsigned long flags) ++{ ++ return AuConv_VM_MAP(flags, GROWSDOWN) ++ | AuConv_VM_MAP(flags, DENYWRITE) ++ | AuConv_VM_MAP(flags, LOCKED); ++} ++#endif ++ ++static int aufs_mmap(struct file *file, struct vm_area_struct *vma) ++{ ++ int err; ++ const unsigned char wlock ++ = (file->f_mode & FMODE_WRITE) && (vma->vm_flags & VM_SHARED); ++ struct super_block *sb; ++ struct file *h_file; ++ struct inode *inode; ++ ++ AuDbgVmRegion(file, vma); ++ ++ inode = file_inode(file); ++ sb = inode->i_sb; ++ lockdep_off(); ++ si_read_lock(sb, AuLock_NOPLMW); ++ ++ h_file = au_write_pre(file, wlock, /*wpre*/NULL); ++ lockdep_on(); ++ err = PTR_ERR(h_file); ++ if (IS_ERR(h_file)) ++ goto out; ++ ++ err = 0; ++ au_set_mmapped(file); ++ au_vm_file_reset(vma, h_file); ++ /* ++ * we cannot call security_mmap_file() here since it may acquire ++ * mmap_sem or i_mutex. ++ * ++ * err = security_mmap_file(h_file, au_prot_conv(vma->vm_flags), ++ * au_flag_conv(vma->vm_flags)); ++ */ ++ if (!err) ++ err = h_file->f_op->mmap(h_file, vma); ++ if (!err) { ++ au_vm_prfile_set(vma, file); ++ fsstack_copy_attr_atime(inode, file_inode(h_file)); ++ goto out_fput; /* success */ ++ } ++ au_unset_mmapped(file); ++ au_vm_file_reset(vma, file); ++ ++out_fput: ++ lockdep_off(); ++ ii_write_unlock(inode); ++ lockdep_on(); ++ fput(h_file); ++out: ++ lockdep_off(); ++ si_read_unlock(sb); ++ lockdep_on(); ++ AuTraceErr(err); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int aufs_fsync_nondir(struct file *file, loff_t start, loff_t end, ++ int datasync) ++{ ++ int err; ++ struct au_write_pre wpre; ++ struct inode *inode; ++ struct file *h_file; ++ ++ err = 0; /* -EBADF; */ /* posix? */ ++ if (unlikely(!(file->f_mode & FMODE_WRITE))) ++ goto out; ++ ++ inode = file_inode(file); ++ au_mtx_and_read_lock(inode); ++ ++ h_file = au_write_pre(file, /*do_ready*/1, &wpre); ++ err = PTR_ERR(h_file); ++ if (IS_ERR(h_file)) ++ goto out_unlock; ++ ++ err = vfsub_fsync(h_file, &h_file->f_path, datasync); ++ au_write_post(inode, h_file, &wpre, /*written*/0); ++ ++out_unlock: ++ si_read_unlock(inode->i_sb); ++ inode_unlock(inode); ++out: ++ return err; ++} ++ ++/* no one supports this operation, currently */ ++#if 0 ++static int aufs_aio_fsync_nondir(struct kiocb *kio, int datasync) ++{ ++ int err; ++ struct au_write_pre wpre; ++ struct inode *inode, *h_inode; ++ struct file *file, *h_file; ++ ++ err = 0; /* -EBADF; */ /* posix? */ ++ if (unlikely(!(file->f_mode & FMODE_WRITE))) ++ goto out; ++ ++ file = kio->ki_filp; ++ inode = file_inode(file); ++ au_mtx_and_read_lock(inode); ++ ++ h_file = au_write_pre(file, /*do_ready*/1, &wpre); ++ err = PTR_ERR(h_file); ++ if (IS_ERR(h_file)) ++ goto out_unlock; ++ ++ err = -ENOSYS; ++ h_file = au_hf_top(file); ++ if (h_file->f_op->aio_fsync) { ++ h_inode = file_inode(h_file); ++ if (!is_sync_kiocb(kio)) { ++ get_file(h_file); ++ fput(file); ++ } ++ kio->ki_filp = h_file; ++ err = h_file->f_op->aio_fsync(kio, datasync); ++ inode_lock_nested(h_inode, AuLsc_I_CHILD); ++ if (!err) ++ vfsub_update_h_iattr(&h_file->f_path, /*did*/NULL); ++ /*ignore*/ ++ inode_unlock(h_inode); ++ } ++ au_write_post(inode, h_file, &wpre, /*written*/0); ++ ++out_unlock: ++ si_read_unlock(inode->sb); ++ inode_unlock(inode); ++out: ++ return err; ++} ++#endif ++ ++static int aufs_fasync(int fd, struct file *file, int flag) ++{ ++ int err; ++ struct file *h_file; ++ struct super_block *sb; ++ ++ sb = file->f_path.dentry->d_sb; ++ si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW); ++ ++ h_file = au_read_pre(file, /*keep_fi*/0); ++ err = PTR_ERR(h_file); ++ if (IS_ERR(h_file)) ++ goto out; ++ ++ if (h_file->f_op->fasync) ++ err = h_file->f_op->fasync(fd, h_file, flag); ++ fput(h_file); /* instead of au_read_post() */ ++ ++out: ++ si_read_unlock(sb); ++ return err; ++} ++ ++static int aufs_setfl(struct file *file, unsigned long arg) ++{ ++ int err; ++ struct file *h_file; ++ struct super_block *sb; ++ ++ sb = file->f_path.dentry->d_sb; ++ si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW); ++ ++ h_file = au_read_pre(file, /*keep_fi*/0); ++ err = PTR_ERR(h_file); ++ if (IS_ERR(h_file)) ++ goto out; ++ ++ arg |= vfsub_file_flags(file) & FASYNC; /* stop calling h_file->fasync */ ++ err = setfl(/*unused fd*/-1, h_file, arg); ++ fput(h_file); /* instead of au_read_post() */ ++ ++out: ++ si_read_unlock(sb); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* no one supports this operation, currently */ ++#if 0 ++static ssize_t aufs_sendpage(struct file *file, struct page *page, int offset, ++ size_t len, loff_t *pos, int more) ++{ ++} ++#endif ++ ++/* ---------------------------------------------------------------------- */ ++ ++const struct file_operations aufs_file_fop = { ++ .owner = THIS_MODULE, ++ ++ .llseek = default_llseek, ++ ++ .read = aufs_read, ++ .write = aufs_write, ++ .read_iter = aufs_read_iter, ++ .write_iter = aufs_write_iter, ++ ++#ifdef CONFIG_AUFS_POLL ++ .poll = aufs_poll, ++#endif ++ .unlocked_ioctl = aufs_ioctl_nondir, ++#ifdef CONFIG_COMPAT ++ .compat_ioctl = aufs_compat_ioctl_nondir, ++#endif ++ .mmap = aufs_mmap, ++ .open = aufs_open_nondir, ++ .flush = aufs_flush_nondir, ++ .release = aufs_release_nondir, ++ .fsync = aufs_fsync_nondir, ++ /* .aio_fsync = aufs_aio_fsync_nondir, */ ++ .fasync = aufs_fasync, ++ /* .sendpage = aufs_sendpage, */ ++ .setfl = aufs_setfl, ++ .splice_write = aufs_splice_write, ++ .splice_read = aufs_splice_read, ++#if 0 ++ .aio_splice_write = aufs_aio_splice_write, ++ .aio_splice_read = aufs_aio_splice_read, ++#endif ++ .fallocate = aufs_fallocate ++}; +diff --git a/fs/aufs/fhsm.c b/fs/aufs/fhsm.c +new file mode 100644 +index 0000000..40289e4 +--- /dev/null ++++ b/fs/aufs/fhsm.c +@@ -0,0 +1,426 @@ ++/* ++ * Copyright (C) 2011-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * File-based Hierarchy Storage Management ++ */ ++ ++#include ++#include ++#include ++#include ++#include "aufs.h" ++ ++static aufs_bindex_t au_fhsm_bottom(struct super_block *sb) ++{ ++ struct au_sbinfo *sbinfo; ++ struct au_fhsm *fhsm; ++ ++ SiMustAnyLock(sb); ++ ++ sbinfo = au_sbi(sb); ++ fhsm = &sbinfo->si_fhsm; ++ AuDebugOn(!fhsm); ++ return fhsm->fhsm_bottom; ++} ++ ++void au_fhsm_set_bottom(struct super_block *sb, aufs_bindex_t bindex) ++{ ++ struct au_sbinfo *sbinfo; ++ struct au_fhsm *fhsm; ++ ++ SiMustWriteLock(sb); ++ ++ sbinfo = au_sbi(sb); ++ fhsm = &sbinfo->si_fhsm; ++ AuDebugOn(!fhsm); ++ fhsm->fhsm_bottom = bindex; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int au_fhsm_test_jiffy(struct au_sbinfo *sbinfo, struct au_branch *br) ++{ ++ struct au_br_fhsm *bf; ++ ++ bf = br->br_fhsm; ++ MtxMustLock(&bf->bf_lock); ++ ++ return !bf->bf_readable ++ || time_after(jiffies, ++ bf->bf_jiffy + sbinfo->si_fhsm.fhsm_expire); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static void au_fhsm_notify(struct super_block *sb, int val) ++{ ++ struct au_sbinfo *sbinfo; ++ struct au_fhsm *fhsm; ++ ++ SiMustAnyLock(sb); ++ ++ sbinfo = au_sbi(sb); ++ fhsm = &sbinfo->si_fhsm; ++ if (au_fhsm_pid(fhsm) ++ && atomic_read(&fhsm->fhsm_readable) != -1) { ++ atomic_set(&fhsm->fhsm_readable, val); ++ if (val) ++ wake_up(&fhsm->fhsm_wqh); ++ } ++} ++ ++static int au_fhsm_stfs(struct super_block *sb, aufs_bindex_t bindex, ++ struct aufs_stfs *rstfs, int do_lock, int do_notify) ++{ ++ int err; ++ struct au_branch *br; ++ struct au_br_fhsm *bf; ++ ++ br = au_sbr(sb, bindex); ++ AuDebugOn(au_br_rdonly(br)); ++ bf = br->br_fhsm; ++ AuDebugOn(!bf); ++ ++ if (do_lock) ++ mutex_lock(&bf->bf_lock); ++ else ++ MtxMustLock(&bf->bf_lock); ++ ++ /* sb->s_root for NFS is unreliable */ ++ err = au_br_stfs(br, &bf->bf_stfs); ++ if (unlikely(err)) { ++ AuErr1("FHSM failed (%d), b%d, ignored.\n", bindex, err); ++ goto out; ++ } ++ ++ bf->bf_jiffy = jiffies; ++ bf->bf_readable = 1; ++ if (do_notify) ++ au_fhsm_notify(sb, /*val*/1); ++ if (rstfs) ++ *rstfs = bf->bf_stfs; ++ ++out: ++ if (do_lock) ++ mutex_unlock(&bf->bf_lock); ++ au_fhsm_notify(sb, /*val*/1); ++ ++ return err; ++} ++ ++void au_fhsm_wrote(struct super_block *sb, aufs_bindex_t bindex, int force) ++{ ++ int err; ++ struct au_sbinfo *sbinfo; ++ struct au_fhsm *fhsm; ++ struct au_branch *br; ++ struct au_br_fhsm *bf; ++ ++ AuDbg("b%d, force %d\n", bindex, force); ++ SiMustAnyLock(sb); ++ ++ sbinfo = au_sbi(sb); ++ fhsm = &sbinfo->si_fhsm; ++ if (!au_ftest_si(sbinfo, FHSM) ++ || fhsm->fhsm_bottom == bindex) ++ return; ++ ++ br = au_sbr(sb, bindex); ++ bf = br->br_fhsm; ++ AuDebugOn(!bf); ++ mutex_lock(&bf->bf_lock); ++ if (force ++ || au_fhsm_pid(fhsm) ++ || au_fhsm_test_jiffy(sbinfo, br)) ++ err = au_fhsm_stfs(sb, bindex, /*rstfs*/NULL, /*do_lock*/0, ++ /*do_notify*/1); ++ mutex_unlock(&bf->bf_lock); ++} ++ ++void au_fhsm_wrote_all(struct super_block *sb, int force) ++{ ++ aufs_bindex_t bindex, bbot; ++ struct au_branch *br; ++ ++ /* exclude the bottom */ ++ bbot = au_fhsm_bottom(sb); ++ for (bindex = 0; bindex < bbot; bindex++) { ++ br = au_sbr(sb, bindex); ++ if (au_br_fhsm(br->br_perm)) ++ au_fhsm_wrote(sb, bindex, force); ++ } ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static unsigned int au_fhsm_poll(struct file *file, ++ struct poll_table_struct *wait) ++{ ++ unsigned int mask; ++ struct au_sbinfo *sbinfo; ++ struct au_fhsm *fhsm; ++ ++ mask = 0; ++ sbinfo = file->private_data; ++ fhsm = &sbinfo->si_fhsm; ++ poll_wait(file, &fhsm->fhsm_wqh, wait); ++ if (atomic_read(&fhsm->fhsm_readable)) ++ mask = POLLIN /* | POLLRDNORM */; ++ ++ AuTraceErr((int)mask); ++ return mask; ++} ++ ++static int au_fhsm_do_read_one(struct aufs_stbr __user *stbr, ++ struct aufs_stfs *stfs, __s16 brid) ++{ ++ int err; ++ ++ err = copy_to_user(&stbr->stfs, stfs, sizeof(*stfs)); ++ if (!err) ++ err = __put_user(brid, &stbr->brid); ++ if (unlikely(err)) ++ err = -EFAULT; ++ ++ return err; ++} ++ ++static ssize_t au_fhsm_do_read(struct super_block *sb, ++ struct aufs_stbr __user *stbr, size_t count) ++{ ++ ssize_t err; ++ int nstbr; ++ aufs_bindex_t bindex, bbot; ++ struct au_branch *br; ++ struct au_br_fhsm *bf; ++ ++ /* except the bottom branch */ ++ err = 0; ++ nstbr = 0; ++ bbot = au_fhsm_bottom(sb); ++ for (bindex = 0; !err && bindex < bbot; bindex++) { ++ br = au_sbr(sb, bindex); ++ if (!au_br_fhsm(br->br_perm)) ++ continue; ++ ++ bf = br->br_fhsm; ++ mutex_lock(&bf->bf_lock); ++ if (bf->bf_readable) { ++ err = -EFAULT; ++ if (count >= sizeof(*stbr)) ++ err = au_fhsm_do_read_one(stbr++, &bf->bf_stfs, ++ br->br_id); ++ if (!err) { ++ bf->bf_readable = 0; ++ count -= sizeof(*stbr); ++ nstbr++; ++ } ++ } ++ mutex_unlock(&bf->bf_lock); ++ } ++ if (!err) ++ err = sizeof(*stbr) * nstbr; ++ ++ return err; ++} ++ ++static ssize_t au_fhsm_read(struct file *file, char __user *buf, size_t count, ++ loff_t *pos) ++{ ++ ssize_t err; ++ int readable; ++ aufs_bindex_t nfhsm, bindex, bbot; ++ struct au_sbinfo *sbinfo; ++ struct au_fhsm *fhsm; ++ struct au_branch *br; ++ struct super_block *sb; ++ ++ err = 0; ++ sbinfo = file->private_data; ++ fhsm = &sbinfo->si_fhsm; ++need_data: ++ spin_lock_irq(&fhsm->fhsm_wqh.lock); ++ if (!atomic_read(&fhsm->fhsm_readable)) { ++ if (vfsub_file_flags(file) & O_NONBLOCK) ++ err = -EAGAIN; ++ else ++ err = wait_event_interruptible_locked_irq ++ (fhsm->fhsm_wqh, ++ atomic_read(&fhsm->fhsm_readable)); ++ } ++ spin_unlock_irq(&fhsm->fhsm_wqh.lock); ++ if (unlikely(err)) ++ goto out; ++ ++ /* sb may already be dead */ ++ au_rw_read_lock(&sbinfo->si_rwsem); ++ readable = atomic_read(&fhsm->fhsm_readable); ++ if (readable > 0) { ++ sb = sbinfo->si_sb; ++ AuDebugOn(!sb); ++ /* exclude the bottom branch */ ++ nfhsm = 0; ++ bbot = au_fhsm_bottom(sb); ++ for (bindex = 0; bindex < bbot; bindex++) { ++ br = au_sbr(sb, bindex); ++ if (au_br_fhsm(br->br_perm)) ++ nfhsm++; ++ } ++ err = -EMSGSIZE; ++ if (nfhsm * sizeof(struct aufs_stbr) <= count) { ++ atomic_set(&fhsm->fhsm_readable, 0); ++ err = au_fhsm_do_read(sbinfo->si_sb, (void __user *)buf, ++ count); ++ } ++ } ++ au_rw_read_unlock(&sbinfo->si_rwsem); ++ if (!readable) ++ goto need_data; ++ ++out: ++ return err; ++} ++ ++static int au_fhsm_release(struct inode *inode, struct file *file) ++{ ++ struct au_sbinfo *sbinfo; ++ struct au_fhsm *fhsm; ++ ++ /* sb may already be dead */ ++ sbinfo = file->private_data; ++ fhsm = &sbinfo->si_fhsm; ++ spin_lock(&fhsm->fhsm_spin); ++ fhsm->fhsm_pid = 0; ++ spin_unlock(&fhsm->fhsm_spin); ++ kobject_put(&sbinfo->si_kobj); ++ ++ return 0; ++} ++ ++static const struct file_operations au_fhsm_fops = { ++ .owner = THIS_MODULE, ++ .llseek = noop_llseek, ++ .read = au_fhsm_read, ++ .poll = au_fhsm_poll, ++ .release = au_fhsm_release ++}; ++ ++int au_fhsm_fd(struct super_block *sb, int oflags) ++{ ++ int err, fd; ++ struct au_sbinfo *sbinfo; ++ struct au_fhsm *fhsm; ++ ++ err = -EPERM; ++ if (unlikely(!capable(CAP_SYS_ADMIN))) ++ goto out; ++ ++ err = -EINVAL; ++ if (unlikely(oflags & ~(O_CLOEXEC | O_NONBLOCK))) ++ goto out; ++ ++ err = 0; ++ sbinfo = au_sbi(sb); ++ fhsm = &sbinfo->si_fhsm; ++ spin_lock(&fhsm->fhsm_spin); ++ if (!fhsm->fhsm_pid) ++ fhsm->fhsm_pid = current->pid; ++ else ++ err = -EBUSY; ++ spin_unlock(&fhsm->fhsm_spin); ++ if (unlikely(err)) ++ goto out; ++ ++ oflags |= O_RDONLY; ++ /* oflags |= FMODE_NONOTIFY; */ ++ fd = anon_inode_getfd("[aufs_fhsm]", &au_fhsm_fops, sbinfo, oflags); ++ err = fd; ++ if (unlikely(fd < 0)) ++ goto out_pid; ++ ++ /* succeed reglardless 'fhsm' status */ ++ kobject_get(&sbinfo->si_kobj); ++ si_noflush_read_lock(sb); ++ if (au_ftest_si(sbinfo, FHSM)) ++ au_fhsm_wrote_all(sb, /*force*/0); ++ si_read_unlock(sb); ++ goto out; /* success */ ++ ++out_pid: ++ spin_lock(&fhsm->fhsm_spin); ++ fhsm->fhsm_pid = 0; ++ spin_unlock(&fhsm->fhsm_spin); ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++int au_fhsm_br_alloc(struct au_branch *br) ++{ ++ int err; ++ ++ err = 0; ++ br->br_fhsm = kmalloc(sizeof(*br->br_fhsm), GFP_NOFS); ++ if (br->br_fhsm) ++ au_br_fhsm_init(br->br_fhsm); ++ else ++ err = -ENOMEM; ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++void au_fhsm_fin(struct super_block *sb) ++{ ++ au_fhsm_notify(sb, /*val*/-1); ++} ++ ++void au_fhsm_init(struct au_sbinfo *sbinfo) ++{ ++ struct au_fhsm *fhsm; ++ ++ fhsm = &sbinfo->si_fhsm; ++ spin_lock_init(&fhsm->fhsm_spin); ++ init_waitqueue_head(&fhsm->fhsm_wqh); ++ atomic_set(&fhsm->fhsm_readable, 0); ++ fhsm->fhsm_expire ++ = msecs_to_jiffies(AUFS_FHSM_CACHE_DEF_SEC * MSEC_PER_SEC); ++ fhsm->fhsm_bottom = -1; ++} ++ ++void au_fhsm_set(struct au_sbinfo *sbinfo, unsigned int sec) ++{ ++ sbinfo->si_fhsm.fhsm_expire ++ = msecs_to_jiffies(sec * MSEC_PER_SEC); ++} ++ ++void au_fhsm_show(struct seq_file *seq, struct au_sbinfo *sbinfo) ++{ ++ unsigned int u; ++ ++ if (!au_ftest_si(sbinfo, FHSM)) ++ return; ++ ++ u = jiffies_to_msecs(sbinfo->si_fhsm.fhsm_expire) / MSEC_PER_SEC; ++ if (u != AUFS_FHSM_CACHE_DEF_SEC) ++ seq_printf(seq, ",fhsm_sec=%u", u); ++} +diff --git a/fs/aufs/file.c b/fs/aufs/file.c +new file mode 100644 +index 0000000..65926e8 +--- /dev/null ++++ b/fs/aufs/file.c +@@ -0,0 +1,844 @@ ++/* ++ * Copyright (C) 2005-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * handling file/dir, and address_space operation ++ */ ++ ++#ifdef CONFIG_AUFS_DEBUG ++#include ++#endif ++#include ++#include "aufs.h" ++ ++/* drop flags for writing */ ++unsigned int au_file_roflags(unsigned int flags) ++{ ++ flags &= ~(O_WRONLY | O_RDWR | O_APPEND | O_CREAT | O_TRUNC); ++ flags |= O_RDONLY | O_NOATIME; ++ return flags; ++} ++ ++/* common functions to regular file and dir */ ++struct file *au_h_open(struct dentry *dentry, aufs_bindex_t bindex, int flags, ++ struct file *file, int force_wr) ++{ ++ struct file *h_file; ++ struct dentry *h_dentry; ++ struct inode *h_inode; ++ struct super_block *sb; ++ struct au_branch *br; ++ struct path h_path; ++ int err; ++ ++ /* a race condition can happen between open and unlink/rmdir */ ++ h_file = ERR_PTR(-ENOENT); ++ h_dentry = au_h_dptr(dentry, bindex); ++ if (au_test_nfsd() && (!h_dentry || d_is_negative(h_dentry))) ++ goto out; ++ h_inode = d_inode(h_dentry); ++ spin_lock(&h_dentry->d_lock); ++ err = (!d_unhashed(dentry) && d_unlinked(h_dentry)) ++ /* || !d_inode(dentry)->i_nlink */ ++ ; ++ spin_unlock(&h_dentry->d_lock); ++ if (unlikely(err)) ++ goto out; ++ ++ sb = dentry->d_sb; ++ br = au_sbr(sb, bindex); ++ err = au_br_test_oflag(flags, br); ++ h_file = ERR_PTR(err); ++ if (unlikely(err)) ++ goto out; ++ ++ /* drop flags for writing */ ++ if (au_test_ro(sb, bindex, d_inode(dentry))) { ++ if (force_wr && !(flags & O_WRONLY)) ++ force_wr = 0; ++ flags = au_file_roflags(flags); ++ if (force_wr) { ++ h_file = ERR_PTR(-EROFS); ++ flags = au_file_roflags(flags); ++ if (unlikely(vfsub_native_ro(h_inode) ++ || IS_APPEND(h_inode))) ++ goto out; ++ flags &= ~O_ACCMODE; ++ flags |= O_WRONLY; ++ } ++ } ++ flags &= ~O_CREAT; ++ au_br_get(br); ++ h_path.dentry = h_dentry; ++ h_path.mnt = au_br_mnt(br); ++ h_file = vfsub_dentry_open(&h_path, flags); ++ if (IS_ERR(h_file)) ++ goto out_br; ++ ++ if (flags & __FMODE_EXEC) { ++ err = deny_write_access(h_file); ++ if (unlikely(err)) { ++ fput(h_file); ++ h_file = ERR_PTR(err); ++ goto out_br; ++ } ++ } ++ fsnotify_open(h_file); ++ goto out; /* success */ ++ ++out_br: ++ au_br_put(br); ++out: ++ return h_file; ++} ++ ++static int au_cmoo(struct dentry *dentry) ++{ ++ int err, cmoo; ++ unsigned int udba; ++ struct path h_path; ++ struct au_pin pin; ++ struct au_cp_generic cpg = { ++ .dentry = dentry, ++ .bdst = -1, ++ .bsrc = -1, ++ .len = -1, ++ .pin = &pin, ++ .flags = AuCpup_DTIME | AuCpup_HOPEN ++ }; ++ struct inode *delegated; ++ struct super_block *sb; ++ struct au_sbinfo *sbinfo; ++ struct au_fhsm *fhsm; ++ pid_t pid; ++ struct au_branch *br; ++ struct dentry *parent; ++ struct au_hinode *hdir; ++ ++ DiMustWriteLock(dentry); ++ IiMustWriteLock(d_inode(dentry)); ++ ++ err = 0; ++ if (IS_ROOT(dentry)) ++ goto out; ++ cpg.bsrc = au_dbtop(dentry); ++ if (!cpg.bsrc) ++ goto out; ++ ++ sb = dentry->d_sb; ++ sbinfo = au_sbi(sb); ++ fhsm = &sbinfo->si_fhsm; ++ pid = au_fhsm_pid(fhsm); ++ if (pid ++ && (current->pid == pid ++ || current->real_parent->pid == pid)) ++ goto out; ++ ++ br = au_sbr(sb, cpg.bsrc); ++ cmoo = au_br_cmoo(br->br_perm); ++ if (!cmoo) ++ goto out; ++ if (!d_is_reg(dentry)) ++ cmoo &= AuBrAttr_COO_ALL; ++ if (!cmoo) ++ goto out; ++ ++ parent = dget_parent(dentry); ++ di_write_lock_parent(parent); ++ err = au_wbr_do_copyup_bu(dentry, cpg.bsrc - 1); ++ cpg.bdst = err; ++ if (unlikely(err < 0)) { ++ err = 0; /* there is no upper writable branch */ ++ goto out_dgrade; ++ } ++ AuDbg("bsrc %d, bdst %d\n", cpg.bsrc, cpg.bdst); ++ ++ /* do not respect the coo attrib for the target branch */ ++ err = au_cpup_dirs(dentry, cpg.bdst); ++ if (unlikely(err)) ++ goto out_dgrade; ++ ++ di_downgrade_lock(parent, AuLock_IR); ++ udba = au_opt_udba(sb); ++ err = au_pin(&pin, dentry, cpg.bdst, udba, ++ AuPin_DI_LOCKED | AuPin_MNT_WRITE); ++ if (unlikely(err)) ++ goto out_parent; ++ ++ err = au_sio_cpup_simple(&cpg); ++ au_unpin(&pin); ++ if (unlikely(err)) ++ goto out_parent; ++ if (!(cmoo & AuBrWAttr_MOO)) ++ goto out_parent; /* success */ ++ ++ err = au_pin(&pin, dentry, cpg.bsrc, udba, ++ AuPin_DI_LOCKED | AuPin_MNT_WRITE); ++ if (unlikely(err)) ++ goto out_parent; ++ ++ h_path.mnt = au_br_mnt(br); ++ h_path.dentry = au_h_dptr(dentry, cpg.bsrc); ++ hdir = au_hi(d_inode(parent), cpg.bsrc); ++ delegated = NULL; ++ err = vfsub_unlink(hdir->hi_inode, &h_path, &delegated, /*force*/1); ++ au_unpin(&pin); ++ /* todo: keep h_dentry or not? */ ++ if (unlikely(err == -EWOULDBLOCK)) { ++ pr_warn("cannot retry for NFSv4 delegation" ++ " for an internal unlink\n"); ++ iput(delegated); ++ } ++ if (unlikely(err)) { ++ pr_err("unlink %pd after coo failed (%d), ignored\n", ++ dentry, err); ++ err = 0; ++ } ++ goto out_parent; /* success */ ++ ++out_dgrade: ++ di_downgrade_lock(parent, AuLock_IR); ++out_parent: ++ di_read_unlock(parent, AuLock_IR); ++ dput(parent); ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++int au_do_open(struct file *file, struct au_do_open_args *args) ++{ ++ int err, no_lock = args->no_lock; ++ struct dentry *dentry; ++ struct au_finfo *finfo; ++ ++ if (!no_lock) ++ err = au_finfo_init(file, args->fidir); ++ else { ++ lockdep_off(); ++ err = au_finfo_init(file, args->fidir); ++ lockdep_on(); ++ } ++ if (unlikely(err)) ++ goto out; ++ ++ dentry = file->f_path.dentry; ++ AuDebugOn(IS_ERR_OR_NULL(dentry)); ++ if (!no_lock) { ++ di_write_lock_child(dentry); ++ err = au_cmoo(dentry); ++ di_downgrade_lock(dentry, AuLock_IR); ++ if (!err) ++ err = args->open(file, vfsub_file_flags(file), NULL); ++ di_read_unlock(dentry, AuLock_IR); ++ } else { ++ err = au_cmoo(dentry); ++ if (!err) ++ err = args->open(file, vfsub_file_flags(file), ++ args->h_file); ++ if (!err && au_fbtop(file) != au_dbtop(dentry)) ++ /* ++ * cmoo happens after h_file was opened. ++ * need to refresh file later. ++ */ ++ atomic_dec(&au_fi(file)->fi_generation); ++ } ++ ++ finfo = au_fi(file); ++ if (!err) { ++ finfo->fi_file = file; ++ au_sphl_add(&finfo->fi_hlist, ++ &au_sbi(file->f_path.dentry->d_sb)->si_files); ++ } ++ if (!no_lock) ++ fi_write_unlock(file); ++ else { ++ lockdep_off(); ++ fi_write_unlock(file); ++ lockdep_on(); ++ } ++ if (unlikely(err)) { ++ finfo->fi_hdir = NULL; ++ au_finfo_fin(file); ++ } ++ ++out: ++ return err; ++} ++ ++int au_reopen_nondir(struct file *file) ++{ ++ int err; ++ aufs_bindex_t btop; ++ struct dentry *dentry; ++ struct file *h_file, *h_file_tmp; ++ ++ dentry = file->f_path.dentry; ++ btop = au_dbtop(dentry); ++ h_file_tmp = NULL; ++ if (au_fbtop(file) == btop) { ++ h_file = au_hf_top(file); ++ if (file->f_mode == h_file->f_mode) ++ return 0; /* success */ ++ h_file_tmp = h_file; ++ get_file(h_file_tmp); ++ au_set_h_fptr(file, btop, NULL); ++ } ++ AuDebugOn(au_fi(file)->fi_hdir); ++ /* ++ * it can happen ++ * file exists on both of rw and ro ++ * open --> dbtop and fbtop are both 0 ++ * prepend a branch as rw, "rw" become ro ++ * remove rw/file ++ * delete the top branch, "rw" becomes rw again ++ * --> dbtop is 1, fbtop is still 0 ++ * write --> fbtop is 0 but dbtop is 1 ++ */ ++ /* AuDebugOn(au_fbtop(file) < btop); */ ++ ++ h_file = au_h_open(dentry, btop, vfsub_file_flags(file) & ~O_TRUNC, ++ file, /*force_wr*/0); ++ err = PTR_ERR(h_file); ++ if (IS_ERR(h_file)) { ++ if (h_file_tmp) { ++ au_sbr_get(dentry->d_sb, btop); ++ au_set_h_fptr(file, btop, h_file_tmp); ++ h_file_tmp = NULL; ++ } ++ goto out; /* todo: close all? */ ++ } ++ ++ err = 0; ++ au_set_fbtop(file, btop); ++ au_set_h_fptr(file, btop, h_file); ++ au_update_figen(file); ++ /* todo: necessary? */ ++ /* file->f_ra = h_file->f_ra; */ ++ ++out: ++ if (h_file_tmp) ++ fput(h_file_tmp); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int au_reopen_wh(struct file *file, aufs_bindex_t btgt, ++ struct dentry *hi_wh) ++{ ++ int err; ++ aufs_bindex_t btop; ++ struct au_dinfo *dinfo; ++ struct dentry *h_dentry; ++ struct au_hdentry *hdp; ++ ++ dinfo = au_di(file->f_path.dentry); ++ AuRwMustWriteLock(&dinfo->di_rwsem); ++ ++ btop = dinfo->di_btop; ++ dinfo->di_btop = btgt; ++ hdp = dinfo->di_hdentry; ++ h_dentry = hdp[0 + btgt].hd_dentry; ++ hdp[0 + btgt].hd_dentry = hi_wh; ++ err = au_reopen_nondir(file); ++ hdp[0 + btgt].hd_dentry = h_dentry; ++ dinfo->di_btop = btop; ++ ++ return err; ++} ++ ++static int au_ready_to_write_wh(struct file *file, loff_t len, ++ aufs_bindex_t bcpup, struct au_pin *pin) ++{ ++ int err; ++ struct inode *inode, *h_inode; ++ struct dentry *h_dentry, *hi_wh; ++ struct au_cp_generic cpg = { ++ .dentry = file->f_path.dentry, ++ .bdst = bcpup, ++ .bsrc = -1, ++ .len = len, ++ .pin = pin ++ }; ++ ++ au_update_dbtop(cpg.dentry); ++ inode = d_inode(cpg.dentry); ++ h_inode = NULL; ++ if (au_dbtop(cpg.dentry) <= bcpup ++ && au_dbbot(cpg.dentry) >= bcpup) { ++ h_dentry = au_h_dptr(cpg.dentry, bcpup); ++ if (h_dentry && d_is_positive(h_dentry)) ++ h_inode = d_inode(h_dentry); ++ } ++ hi_wh = au_hi_wh(inode, bcpup); ++ if (!hi_wh && !h_inode) ++ err = au_sio_cpup_wh(&cpg, file); ++ else ++ /* already copied-up after unlink */ ++ err = au_reopen_wh(file, bcpup, hi_wh); ++ ++ if (!err ++ && (inode->i_nlink > 1 ++ || (inode->i_state & I_LINKABLE)) ++ && au_opt_test(au_mntflags(cpg.dentry->d_sb), PLINK)) ++ au_plink_append(inode, bcpup, au_h_dptr(cpg.dentry, bcpup)); ++ ++ return err; ++} ++ ++/* ++ * prepare the @file for writing. ++ */ ++int au_ready_to_write(struct file *file, loff_t len, struct au_pin *pin) ++{ ++ int err; ++ aufs_bindex_t dbtop; ++ struct dentry *parent; ++ struct inode *inode; ++ struct super_block *sb; ++ struct file *h_file; ++ struct au_cp_generic cpg = { ++ .dentry = file->f_path.dentry, ++ .bdst = -1, ++ .bsrc = -1, ++ .len = len, ++ .pin = pin, ++ .flags = AuCpup_DTIME ++ }; ++ ++ sb = cpg.dentry->d_sb; ++ inode = d_inode(cpg.dentry); ++ cpg.bsrc = au_fbtop(file); ++ err = au_test_ro(sb, cpg.bsrc, inode); ++ if (!err && (au_hf_top(file)->f_mode & FMODE_WRITE)) { ++ err = au_pin(pin, cpg.dentry, cpg.bsrc, AuOpt_UDBA_NONE, ++ /*flags*/0); ++ goto out; ++ } ++ ++ /* need to cpup or reopen */ ++ parent = dget_parent(cpg.dentry); ++ di_write_lock_parent(parent); ++ err = AuWbrCopyup(au_sbi(sb), cpg.dentry); ++ cpg.bdst = err; ++ if (unlikely(err < 0)) ++ goto out_dgrade; ++ err = 0; ++ ++ if (!d_unhashed(cpg.dentry) && !au_h_dptr(parent, cpg.bdst)) { ++ err = au_cpup_dirs(cpg.dentry, cpg.bdst); ++ if (unlikely(err)) ++ goto out_dgrade; ++ } ++ ++ err = au_pin(pin, cpg.dentry, cpg.bdst, AuOpt_UDBA_NONE, ++ AuPin_DI_LOCKED | AuPin_MNT_WRITE); ++ if (unlikely(err)) ++ goto out_dgrade; ++ ++ dbtop = au_dbtop(cpg.dentry); ++ if (dbtop <= cpg.bdst) ++ cpg.bsrc = cpg.bdst; ++ ++ if (dbtop <= cpg.bdst /* just reopen */ ++ || !d_unhashed(cpg.dentry) /* copyup and reopen */ ++ ) { ++ h_file = au_h_open_pre(cpg.dentry, cpg.bsrc, /*force_wr*/0); ++ if (IS_ERR(h_file)) ++ err = PTR_ERR(h_file); ++ else { ++ di_downgrade_lock(parent, AuLock_IR); ++ if (dbtop > cpg.bdst) ++ err = au_sio_cpup_simple(&cpg); ++ if (!err) ++ err = au_reopen_nondir(file); ++ au_h_open_post(cpg.dentry, cpg.bsrc, h_file); ++ } ++ } else { /* copyup as wh and reopen */ ++ /* ++ * since writable hfsplus branch is not supported, ++ * h_open_pre/post() are unnecessary. ++ */ ++ err = au_ready_to_write_wh(file, len, cpg.bdst, pin); ++ di_downgrade_lock(parent, AuLock_IR); ++ } ++ ++ if (!err) { ++ au_pin_set_parent_lflag(pin, /*lflag*/0); ++ goto out_dput; /* success */ ++ } ++ au_unpin(pin); ++ goto out_unlock; ++ ++out_dgrade: ++ di_downgrade_lock(parent, AuLock_IR); ++out_unlock: ++ di_read_unlock(parent, AuLock_IR); ++out_dput: ++ dput(parent); ++out: ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++int au_do_flush(struct file *file, fl_owner_t id, ++ int (*flush)(struct file *file, fl_owner_t id)) ++{ ++ int err; ++ struct super_block *sb; ++ struct inode *inode; ++ ++ inode = file_inode(file); ++ sb = inode->i_sb; ++ si_noflush_read_lock(sb); ++ fi_read_lock(file); ++ ii_read_lock_child(inode); ++ ++ err = flush(file, id); ++ au_cpup_attr_timesizes(inode); ++ ++ ii_read_unlock(inode); ++ fi_read_unlock(file); ++ si_read_unlock(sb); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int au_file_refresh_by_inode(struct file *file, int *need_reopen) ++{ ++ int err; ++ struct au_pin pin; ++ struct au_finfo *finfo; ++ struct dentry *parent, *hi_wh; ++ struct inode *inode; ++ struct super_block *sb; ++ struct au_cp_generic cpg = { ++ .dentry = file->f_path.dentry, ++ .bdst = -1, ++ .bsrc = -1, ++ .len = -1, ++ .pin = &pin, ++ .flags = AuCpup_DTIME ++ }; ++ ++ FiMustWriteLock(file); ++ ++ err = 0; ++ finfo = au_fi(file); ++ sb = cpg.dentry->d_sb; ++ inode = d_inode(cpg.dentry); ++ cpg.bdst = au_ibtop(inode); ++ if (cpg.bdst == finfo->fi_btop || IS_ROOT(cpg.dentry)) ++ goto out; ++ ++ parent = dget_parent(cpg.dentry); ++ if (au_test_ro(sb, cpg.bdst, inode)) { ++ di_read_lock_parent(parent, !AuLock_IR); ++ err = AuWbrCopyup(au_sbi(sb), cpg.dentry); ++ cpg.bdst = err; ++ di_read_unlock(parent, !AuLock_IR); ++ if (unlikely(err < 0)) ++ goto out_parent; ++ err = 0; ++ } ++ ++ di_read_lock_parent(parent, AuLock_IR); ++ hi_wh = au_hi_wh(inode, cpg.bdst); ++ if (!S_ISDIR(inode->i_mode) ++ && au_opt_test(au_mntflags(sb), PLINK) ++ && au_plink_test(inode) ++ && !d_unhashed(cpg.dentry) ++ && cpg.bdst < au_dbtop(cpg.dentry)) { ++ err = au_test_and_cpup_dirs(cpg.dentry, cpg.bdst); ++ if (unlikely(err)) ++ goto out_unlock; ++ ++ /* always superio. */ ++ err = au_pin(&pin, cpg.dentry, cpg.bdst, AuOpt_UDBA_NONE, ++ AuPin_DI_LOCKED | AuPin_MNT_WRITE); ++ if (!err) { ++ err = au_sio_cpup_simple(&cpg); ++ au_unpin(&pin); ++ } ++ } else if (hi_wh) { ++ /* already copied-up after unlink */ ++ err = au_reopen_wh(file, cpg.bdst, hi_wh); ++ *need_reopen = 0; ++ } ++ ++out_unlock: ++ di_read_unlock(parent, AuLock_IR); ++out_parent: ++ dput(parent); ++out: ++ return err; ++} ++ ++static void au_do_refresh_dir(struct file *file) ++{ ++ aufs_bindex_t bindex, bbot, new_bindex, brid; ++ struct au_hfile *p, tmp, *q; ++ struct au_finfo *finfo; ++ struct super_block *sb; ++ struct au_fidir *fidir; ++ ++ FiMustWriteLock(file); ++ ++ sb = file->f_path.dentry->d_sb; ++ finfo = au_fi(file); ++ fidir = finfo->fi_hdir; ++ AuDebugOn(!fidir); ++ p = fidir->fd_hfile + finfo->fi_btop; ++ brid = p->hf_br->br_id; ++ bbot = fidir->fd_bbot; ++ for (bindex = finfo->fi_btop; bindex <= bbot; bindex++, p++) { ++ if (!p->hf_file) ++ continue; ++ ++ new_bindex = au_br_index(sb, p->hf_br->br_id); ++ if (new_bindex == bindex) ++ continue; ++ if (new_bindex < 0) { ++ au_set_h_fptr(file, bindex, NULL); ++ continue; ++ } ++ ++ /* swap two lower inode, and loop again */ ++ q = fidir->fd_hfile + new_bindex; ++ tmp = *q; ++ *q = *p; ++ *p = tmp; ++ if (tmp.hf_file) { ++ bindex--; ++ p--; ++ } ++ } ++ ++ p = fidir->fd_hfile; ++ if (!au_test_mmapped(file) && !d_unlinked(file->f_path.dentry)) { ++ bbot = au_sbbot(sb); ++ for (finfo->fi_btop = 0; finfo->fi_btop <= bbot; ++ finfo->fi_btop++, p++) ++ if (p->hf_file) { ++ if (file_inode(p->hf_file)) ++ break; ++ au_hfput(p, file); ++ } ++ } else { ++ bbot = au_br_index(sb, brid); ++ for (finfo->fi_btop = 0; finfo->fi_btop < bbot; ++ finfo->fi_btop++, p++) ++ if (p->hf_file) ++ au_hfput(p, file); ++ bbot = au_sbbot(sb); ++ } ++ ++ p = fidir->fd_hfile + bbot; ++ for (fidir->fd_bbot = bbot; fidir->fd_bbot >= finfo->fi_btop; ++ fidir->fd_bbot--, p--) ++ if (p->hf_file) { ++ if (file_inode(p->hf_file)) ++ break; ++ au_hfput(p, file); ++ } ++ AuDebugOn(fidir->fd_bbot < finfo->fi_btop); ++} ++ ++/* ++ * after branch manipulating, refresh the file. ++ */ ++static int refresh_file(struct file *file, int (*reopen)(struct file *file)) ++{ ++ int err, need_reopen; ++ aufs_bindex_t bbot, bindex; ++ struct dentry *dentry; ++ struct au_finfo *finfo; ++ struct au_hfile *hfile; ++ ++ dentry = file->f_path.dentry; ++ finfo = au_fi(file); ++ if (!finfo->fi_hdir) { ++ hfile = &finfo->fi_htop; ++ AuDebugOn(!hfile->hf_file); ++ bindex = au_br_index(dentry->d_sb, hfile->hf_br->br_id); ++ AuDebugOn(bindex < 0); ++ if (bindex != finfo->fi_btop) ++ au_set_fbtop(file, bindex); ++ } else { ++ err = au_fidir_realloc(finfo, au_sbbot(dentry->d_sb) + 1); ++ if (unlikely(err)) ++ goto out; ++ au_do_refresh_dir(file); ++ } ++ ++ err = 0; ++ need_reopen = 1; ++ if (!au_test_mmapped(file)) ++ err = au_file_refresh_by_inode(file, &need_reopen); ++ if (!err && need_reopen && !d_unlinked(dentry)) ++ err = reopen(file); ++ if (!err) { ++ au_update_figen(file); ++ goto out; /* success */ ++ } ++ ++ /* error, close all lower files */ ++ if (finfo->fi_hdir) { ++ bbot = au_fbbot_dir(file); ++ for (bindex = au_fbtop(file); bindex <= bbot; bindex++) ++ au_set_h_fptr(file, bindex, NULL); ++ } ++ ++out: ++ return err; ++} ++ ++/* common function to regular file and dir */ ++int au_reval_and_lock_fdi(struct file *file, int (*reopen)(struct file *file), ++ int wlock) ++{ ++ int err; ++ unsigned int sigen, figen; ++ aufs_bindex_t btop; ++ unsigned char pseudo_link; ++ struct dentry *dentry; ++ struct inode *inode; ++ ++ err = 0; ++ dentry = file->f_path.dentry; ++ inode = d_inode(dentry); ++ sigen = au_sigen(dentry->d_sb); ++ fi_write_lock(file); ++ figen = au_figen(file); ++ di_write_lock_child(dentry); ++ btop = au_dbtop(dentry); ++ pseudo_link = (btop != au_ibtop(inode)); ++ if (sigen == figen && !pseudo_link && au_fbtop(file) == btop) { ++ if (!wlock) { ++ di_downgrade_lock(dentry, AuLock_IR); ++ fi_downgrade_lock(file); ++ } ++ goto out; /* success */ ++ } ++ ++ AuDbg("sigen %d, figen %d\n", sigen, figen); ++ if (au_digen_test(dentry, sigen)) { ++ err = au_reval_dpath(dentry, sigen); ++ AuDebugOn(!err && au_digen_test(dentry, sigen)); ++ } ++ ++ if (!err) ++ err = refresh_file(file, reopen); ++ if (!err) { ++ if (!wlock) { ++ di_downgrade_lock(dentry, AuLock_IR); ++ fi_downgrade_lock(file); ++ } ++ } else { ++ di_write_unlock(dentry); ++ fi_write_unlock(file); ++ } ++ ++out: ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* cf. aufs_nopage() */ ++/* for madvise(2) */ ++static int aufs_readpage(struct file *file __maybe_unused, struct page *page) ++{ ++ unlock_page(page); ++ return 0; ++} ++ ++/* it will never be called, but necessary to support O_DIRECT */ ++static ssize_t aufs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, ++ loff_t offset) ++{ BUG(); return 0; } ++ ++/* they will never be called. */ ++#ifdef CONFIG_AUFS_DEBUG ++static int aufs_write_begin(struct file *file, struct address_space *mapping, ++ loff_t pos, unsigned len, unsigned flags, ++ struct page **pagep, void **fsdata) ++{ AuUnsupport(); return 0; } ++static int aufs_write_end(struct file *file, struct address_space *mapping, ++ loff_t pos, unsigned len, unsigned copied, ++ struct page *page, void *fsdata) ++{ AuUnsupport(); return 0; } ++static int aufs_writepage(struct page *page, struct writeback_control *wbc) ++{ AuUnsupport(); return 0; } ++ ++static int aufs_set_page_dirty(struct page *page) ++{ AuUnsupport(); return 0; } ++static void aufs_invalidatepage(struct page *page, unsigned int offset, ++ unsigned int length) ++{ AuUnsupport(); } ++static int aufs_releasepage(struct page *page, gfp_t gfp) ++{ AuUnsupport(); return 0; } ++#if 0 /* called by memory compaction regardless file */ ++static int aufs_migratepage(struct address_space *mapping, struct page *newpage, ++ struct page *page, enum migrate_mode mode) ++{ AuUnsupport(); return 0; } ++#endif ++static int aufs_launder_page(struct page *page) ++{ AuUnsupport(); return 0; } ++static int aufs_is_partially_uptodate(struct page *page, ++ unsigned long from, ++ unsigned long count) ++{ AuUnsupport(); return 0; } ++static void aufs_is_dirty_writeback(struct page *page, bool *dirty, ++ bool *writeback) ++{ AuUnsupport(); } ++static int aufs_error_remove_page(struct address_space *mapping, ++ struct page *page) ++{ AuUnsupport(); return 0; } ++static int aufs_swap_activate(struct swap_info_struct *sis, struct file *file, ++ sector_t *span) ++{ AuUnsupport(); return 0; } ++static void aufs_swap_deactivate(struct file *file) ++{ AuUnsupport(); } ++#endif /* CONFIG_AUFS_DEBUG */ ++ ++const struct address_space_operations aufs_aop = { ++ .readpage = aufs_readpage, ++ .direct_IO = aufs_direct_IO, ++#ifdef CONFIG_AUFS_DEBUG ++ .writepage = aufs_writepage, ++ /* no writepages, because of writepage */ ++ .set_page_dirty = aufs_set_page_dirty, ++ /* no readpages, because of readpage */ ++ .write_begin = aufs_write_begin, ++ .write_end = aufs_write_end, ++ /* no bmap, no block device */ ++ .invalidatepage = aufs_invalidatepage, ++ .releasepage = aufs_releasepage, ++ /* is fallback_migrate_page ok? */ ++ /* .migratepage = aufs_migratepage, */ ++ .launder_page = aufs_launder_page, ++ .is_partially_uptodate = aufs_is_partially_uptodate, ++ .is_dirty_writeback = aufs_is_dirty_writeback, ++ .error_remove_page = aufs_error_remove_page, ++ .swap_activate = aufs_swap_activate, ++ .swap_deactivate = aufs_swap_deactivate ++#endif /* CONFIG_AUFS_DEBUG */ ++}; +diff --git a/fs/aufs/file.h b/fs/aufs/file.h +new file mode 100644 +index 0000000..cc98251 +--- /dev/null ++++ b/fs/aufs/file.h +@@ -0,0 +1,291 @@ ++/* ++ * Copyright (C) 2005-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * file operations ++ */ ++ ++#ifndef __AUFS_FILE_H__ ++#define __AUFS_FILE_H__ ++ ++#ifdef __KERNEL__ ++ ++#include ++#include ++#include ++#include "rwsem.h" ++ ++struct au_branch; ++struct au_hfile { ++ struct file *hf_file; ++ struct au_branch *hf_br; ++}; ++ ++struct au_vdir; ++struct au_fidir { ++ aufs_bindex_t fd_bbot; ++ aufs_bindex_t fd_nent; ++ struct au_vdir *fd_vdir_cache; ++ struct au_hfile fd_hfile[]; ++}; ++ ++static inline int au_fidir_sz(int nent) ++{ ++ AuDebugOn(nent < 0); ++ return sizeof(struct au_fidir) + sizeof(struct au_hfile) * nent; ++} ++ ++struct au_finfo { ++ atomic_t fi_generation; ++ ++ struct au_rwsem fi_rwsem; ++ aufs_bindex_t fi_btop; ++ ++ /* do not union them */ ++ struct { /* for non-dir */ ++ struct au_hfile fi_htop; ++ atomic_t fi_mmapped; ++ }; ++ struct au_fidir *fi_hdir; /* for dir only */ ++ ++ struct hlist_node fi_hlist; ++ struct file *fi_file; /* very ugly */ ++} ____cacheline_aligned_in_smp; ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* file.c */ ++extern const struct address_space_operations aufs_aop; ++unsigned int au_file_roflags(unsigned int flags); ++struct file *au_h_open(struct dentry *dentry, aufs_bindex_t bindex, int flags, ++ struct file *file, int force_wr); ++struct au_do_open_args { ++ int no_lock; ++ int (*open)(struct file *file, int flags, ++ struct file *h_file); ++ struct au_fidir *fidir; ++ struct file *h_file; ++}; ++int au_do_open(struct file *file, struct au_do_open_args *args); ++int au_reopen_nondir(struct file *file); ++struct au_pin; ++int au_ready_to_write(struct file *file, loff_t len, struct au_pin *pin); ++int au_reval_and_lock_fdi(struct file *file, int (*reopen)(struct file *file), ++ int wlock); ++int au_do_flush(struct file *file, fl_owner_t id, ++ int (*flush)(struct file *file, fl_owner_t id)); ++ ++/* poll.c */ ++#ifdef CONFIG_AUFS_POLL ++unsigned int aufs_poll(struct file *file, poll_table *wait); ++#endif ++ ++#ifdef CONFIG_AUFS_BR_HFSPLUS ++/* hfsplus.c */ ++struct file *au_h_open_pre(struct dentry *dentry, aufs_bindex_t bindex, ++ int force_wr); ++void au_h_open_post(struct dentry *dentry, aufs_bindex_t bindex, ++ struct file *h_file); ++#else ++AuStub(struct file *, au_h_open_pre, return NULL, struct dentry *dentry, ++ aufs_bindex_t bindex, int force_wr) ++AuStubVoid(au_h_open_post, struct dentry *dentry, aufs_bindex_t bindex, ++ struct file *h_file); ++#endif ++ ++/* f_op.c */ ++extern const struct file_operations aufs_file_fop; ++int au_do_open_nondir(struct file *file, int flags, struct file *h_file); ++int aufs_release_nondir(struct inode *inode __maybe_unused, struct file *file); ++struct file *au_read_pre(struct file *file, int keep_fi); ++ ++/* finfo.c */ ++void au_hfput(struct au_hfile *hf, struct file *file); ++void au_set_h_fptr(struct file *file, aufs_bindex_t bindex, ++ struct file *h_file); ++ ++void au_update_figen(struct file *file); ++struct au_fidir *au_fidir_alloc(struct super_block *sb); ++int au_fidir_realloc(struct au_finfo *finfo, int nbr); ++ ++void au_fi_init_once(void *_fi); ++void au_finfo_fin(struct file *file); ++int au_finfo_init(struct file *file, struct au_fidir *fidir); ++ ++/* ioctl.c */ ++long aufs_ioctl_nondir(struct file *file, unsigned int cmd, unsigned long arg); ++#ifdef CONFIG_COMPAT ++long aufs_compat_ioctl_dir(struct file *file, unsigned int cmd, ++ unsigned long arg); ++long aufs_compat_ioctl_nondir(struct file *file, unsigned int cmd, ++ unsigned long arg); ++#endif ++ ++/* ---------------------------------------------------------------------- */ ++ ++static inline struct au_finfo *au_fi(struct file *file) ++{ ++ return file->private_data; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * fi_read_lock, fi_write_lock, ++ * fi_read_unlock, fi_write_unlock, fi_downgrade_lock ++ */ ++AuSimpleRwsemFuncs(fi, struct file *f, &au_fi(f)->fi_rwsem); ++ ++#define FiMustNoWaiters(f) AuRwMustNoWaiters(&au_fi(f)->fi_rwsem) ++#define FiMustAnyLock(f) AuRwMustAnyLock(&au_fi(f)->fi_rwsem) ++#define FiMustWriteLock(f) AuRwMustWriteLock(&au_fi(f)->fi_rwsem) ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* todo: hard/soft set? */ ++static inline aufs_bindex_t au_fbtop(struct file *file) ++{ ++ FiMustAnyLock(file); ++ return au_fi(file)->fi_btop; ++} ++ ++static inline aufs_bindex_t au_fbbot_dir(struct file *file) ++{ ++ FiMustAnyLock(file); ++ AuDebugOn(!au_fi(file)->fi_hdir); ++ return au_fi(file)->fi_hdir->fd_bbot; ++} ++ ++static inline struct au_vdir *au_fvdir_cache(struct file *file) ++{ ++ FiMustAnyLock(file); ++ AuDebugOn(!au_fi(file)->fi_hdir); ++ return au_fi(file)->fi_hdir->fd_vdir_cache; ++} ++ ++static inline void au_set_fbtop(struct file *file, aufs_bindex_t bindex) ++{ ++ FiMustWriteLock(file); ++ au_fi(file)->fi_btop = bindex; ++} ++ ++static inline void au_set_fbbot_dir(struct file *file, aufs_bindex_t bindex) ++{ ++ FiMustWriteLock(file); ++ AuDebugOn(!au_fi(file)->fi_hdir); ++ au_fi(file)->fi_hdir->fd_bbot = bindex; ++} ++ ++static inline void au_set_fvdir_cache(struct file *file, ++ struct au_vdir *vdir_cache) ++{ ++ FiMustWriteLock(file); ++ AuDebugOn(!au_fi(file)->fi_hdir); ++ au_fi(file)->fi_hdir->fd_vdir_cache = vdir_cache; ++} ++ ++static inline struct file *au_hf_top(struct file *file) ++{ ++ FiMustAnyLock(file); ++ AuDebugOn(au_fi(file)->fi_hdir); ++ return au_fi(file)->fi_htop.hf_file; ++} ++ ++static inline struct file *au_hf_dir(struct file *file, aufs_bindex_t bindex) ++{ ++ FiMustAnyLock(file); ++ AuDebugOn(!au_fi(file)->fi_hdir); ++ return au_fi(file)->fi_hdir->fd_hfile[0 + bindex].hf_file; ++} ++ ++/* todo: memory barrier? */ ++static inline unsigned int au_figen(struct file *f) ++{ ++ return atomic_read(&au_fi(f)->fi_generation); ++} ++ ++static inline void au_set_mmapped(struct file *f) ++{ ++ if (atomic_inc_return(&au_fi(f)->fi_mmapped)) ++ return; ++ pr_warn("fi_mmapped wrapped around\n"); ++ while (!atomic_inc_return(&au_fi(f)->fi_mmapped)) ++ ; ++} ++ ++static inline void au_unset_mmapped(struct file *f) ++{ ++ atomic_dec(&au_fi(f)->fi_mmapped); ++} ++ ++static inline int au_test_mmapped(struct file *f) ++{ ++ return atomic_read(&au_fi(f)->fi_mmapped); ++} ++ ++/* customize vma->vm_file */ ++ ++static inline void au_do_vm_file_reset(struct vm_area_struct *vma, ++ struct file *file) ++{ ++ struct file *f; ++ ++ f = vma->vm_file; ++ get_file(file); ++ vma->vm_file = file; ++ fput(f); ++} ++ ++#ifdef CONFIG_MMU ++#define AuDbgVmRegion(file, vma) do {} while (0) ++ ++static inline void au_vm_file_reset(struct vm_area_struct *vma, ++ struct file *file) ++{ ++ au_do_vm_file_reset(vma, file); ++} ++#else ++#define AuDbgVmRegion(file, vma) \ ++ AuDebugOn((vma)->vm_region && (vma)->vm_region->vm_file != (file)) ++ ++static inline void au_vm_file_reset(struct vm_area_struct *vma, ++ struct file *file) ++{ ++ struct file *f; ++ ++ au_do_vm_file_reset(vma, file); ++ f = vma->vm_region->vm_file; ++ get_file(file); ++ vma->vm_region->vm_file = file; ++ fput(f); ++} ++#endif /* CONFIG_MMU */ ++ ++/* handle vma->vm_prfile */ ++static inline void au_vm_prfile_set(struct vm_area_struct *vma, ++ struct file *file) ++{ ++ get_file(file); ++ vma->vm_prfile = file; ++#ifndef CONFIG_MMU ++ get_file(file); ++ vma->vm_region->vm_prfile = file; ++#endif ++} ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_FILE_H__ */ +diff --git a/fs/aufs/finfo.c b/fs/aufs/finfo.c +new file mode 100644 +index 0000000..179f21a +--- /dev/null ++++ b/fs/aufs/finfo.c +@@ -0,0 +1,149 @@ ++/* ++ * Copyright (C) 2005-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * file private data ++ */ ++ ++#include "aufs.h" ++ ++void au_hfput(struct au_hfile *hf, struct file *file) ++{ ++ /* todo: direct access f_flags */ ++ if (vfsub_file_flags(file) & __FMODE_EXEC) ++ allow_write_access(hf->hf_file); ++ fput(hf->hf_file); ++ hf->hf_file = NULL; ++ au_br_put(hf->hf_br); ++ hf->hf_br = NULL; ++} ++ ++void au_set_h_fptr(struct file *file, aufs_bindex_t bindex, struct file *val) ++{ ++ struct au_finfo *finfo = au_fi(file); ++ struct au_hfile *hf; ++ struct au_fidir *fidir; ++ ++ fidir = finfo->fi_hdir; ++ if (!fidir) { ++ AuDebugOn(finfo->fi_btop != bindex); ++ hf = &finfo->fi_htop; ++ } else ++ hf = fidir->fd_hfile + bindex; ++ ++ if (hf && hf->hf_file) ++ au_hfput(hf, file); ++ if (val) { ++ FiMustWriteLock(file); ++ AuDebugOn(IS_ERR_OR_NULL(file->f_path.dentry)); ++ hf->hf_file = val; ++ hf->hf_br = au_sbr(file->f_path.dentry->d_sb, bindex); ++ } ++} ++ ++void au_update_figen(struct file *file) ++{ ++ atomic_set(&au_fi(file)->fi_generation, au_digen(file->f_path.dentry)); ++ /* smp_mb(); */ /* atomic_set */ ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct au_fidir *au_fidir_alloc(struct super_block *sb) ++{ ++ struct au_fidir *fidir; ++ int nbr; ++ ++ nbr = au_sbbot(sb) + 1; ++ if (nbr < 2) ++ nbr = 2; /* initial allocate for 2 branches */ ++ fidir = kzalloc(au_fidir_sz(nbr), GFP_NOFS); ++ if (fidir) { ++ fidir->fd_bbot = -1; ++ fidir->fd_nent = nbr; ++ } ++ ++ return fidir; ++} ++ ++int au_fidir_realloc(struct au_finfo *finfo, int nbr) ++{ ++ int err; ++ struct au_fidir *fidir, *p; ++ ++ AuRwMustWriteLock(&finfo->fi_rwsem); ++ fidir = finfo->fi_hdir; ++ AuDebugOn(!fidir); ++ ++ err = -ENOMEM; ++ p = au_kzrealloc(fidir, au_fidir_sz(fidir->fd_nent), au_fidir_sz(nbr), ++ GFP_NOFS); ++ if (p) { ++ p->fd_nent = nbr; ++ finfo->fi_hdir = p; ++ err = 0; ++ } ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++void au_finfo_fin(struct file *file) ++{ ++ struct au_finfo *finfo; ++ ++ au_nfiles_dec(file->f_path.dentry->d_sb); ++ ++ finfo = au_fi(file); ++ AuDebugOn(finfo->fi_hdir); ++ AuRwDestroy(&finfo->fi_rwsem); ++ au_cache_free_finfo(finfo); ++} ++ ++void au_fi_init_once(void *_finfo) ++{ ++ struct au_finfo *finfo = _finfo; ++ ++ au_rw_init(&finfo->fi_rwsem); ++} ++ ++int au_finfo_init(struct file *file, struct au_fidir *fidir) ++{ ++ int err; ++ struct au_finfo *finfo; ++ struct dentry *dentry; ++ ++ err = -ENOMEM; ++ dentry = file->f_path.dentry; ++ finfo = au_cache_alloc_finfo(); ++ if (unlikely(!finfo)) ++ goto out; ++ ++ err = 0; ++ au_nfiles_inc(dentry->d_sb); ++ au_rw_write_lock(&finfo->fi_rwsem); ++ finfo->fi_btop = -1; ++ finfo->fi_hdir = fidir; ++ atomic_set(&finfo->fi_generation, au_digen(dentry)); ++ /* smp_mb(); */ /* atomic_set */ ++ ++ file->private_data = finfo; ++ ++out: ++ return err; ++} +diff --git a/fs/aufs/fstype.h b/fs/aufs/fstype.h +new file mode 100644 +index 0000000..2842400 +--- /dev/null ++++ b/fs/aufs/fstype.h +@@ -0,0 +1,400 @@ ++/* ++ * Copyright (C) 2005-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * judging filesystem type ++ */ ++ ++#ifndef __AUFS_FSTYPE_H__ ++#define __AUFS_FSTYPE_H__ ++ ++#ifdef __KERNEL__ ++ ++#include ++#include ++#include ++#include ++ ++static inline int au_test_aufs(struct super_block *sb) ++{ ++ return sb->s_magic == AUFS_SUPER_MAGIC; ++} ++ ++static inline const char *au_sbtype(struct super_block *sb) ++{ ++ return sb->s_type->name; ++} ++ ++static inline int au_test_iso9660(struct super_block *sb __maybe_unused) ++{ ++#if defined(CONFIG_ISO9660_FS) || defined(CONFIG_ISO9660_FS_MODULE) ++ return sb->s_magic == ISOFS_SUPER_MAGIC; ++#else ++ return 0; ++#endif ++} ++ ++static inline int au_test_romfs(struct super_block *sb __maybe_unused) ++{ ++#if defined(CONFIG_ROMFS_FS) || defined(CONFIG_ROMFS_FS_MODULE) ++ return sb->s_magic == ROMFS_MAGIC; ++#else ++ return 0; ++#endif ++} ++ ++static inline int au_test_cramfs(struct super_block *sb __maybe_unused) ++{ ++#if defined(CONFIG_CRAMFS) || defined(CONFIG_CRAMFS_MODULE) ++ return sb->s_magic == CRAMFS_MAGIC; ++#endif ++ return 0; ++} ++ ++static inline int au_test_nfs(struct super_block *sb __maybe_unused) ++{ ++#if defined(CONFIG_NFS_FS) || defined(CONFIG_NFS_FS_MODULE) ++ return sb->s_magic == NFS_SUPER_MAGIC; ++#else ++ return 0; ++#endif ++} ++ ++static inline int au_test_fuse(struct super_block *sb __maybe_unused) ++{ ++#if defined(CONFIG_FUSE_FS) || defined(CONFIG_FUSE_FS_MODULE) ++ return sb->s_magic == FUSE_SUPER_MAGIC; ++#else ++ return 0; ++#endif ++} ++ ++static inline int au_test_xfs(struct super_block *sb __maybe_unused) ++{ ++#if defined(CONFIG_XFS_FS) || defined(CONFIG_XFS_FS_MODULE) ++ return sb->s_magic == XFS_SB_MAGIC; ++#else ++ return 0; ++#endif ++} ++ ++static inline int au_test_tmpfs(struct super_block *sb __maybe_unused) ++{ ++#ifdef CONFIG_TMPFS ++ return sb->s_magic == TMPFS_MAGIC; ++#else ++ return 0; ++#endif ++} ++ ++static inline int au_test_ecryptfs(struct super_block *sb __maybe_unused) ++{ ++#if defined(CONFIG_ECRYPT_FS) || defined(CONFIG_ECRYPT_FS_MODULE) ++ return !strcmp(au_sbtype(sb), "ecryptfs"); ++#else ++ return 0; ++#endif ++} ++ ++static inline int au_test_ramfs(struct super_block *sb) ++{ ++ return sb->s_magic == RAMFS_MAGIC; ++} ++ ++static inline int au_test_ubifs(struct super_block *sb __maybe_unused) ++{ ++#if defined(CONFIG_UBIFS_FS) || defined(CONFIG_UBIFS_FS_MODULE) ++ return sb->s_magic == UBIFS_SUPER_MAGIC; ++#else ++ return 0; ++#endif ++} ++ ++static inline int au_test_procfs(struct super_block *sb __maybe_unused) ++{ ++#ifdef CONFIG_PROC_FS ++ return sb->s_magic == PROC_SUPER_MAGIC; ++#else ++ return 0; ++#endif ++} ++ ++static inline int au_test_sysfs(struct super_block *sb __maybe_unused) ++{ ++#ifdef CONFIG_SYSFS ++ return sb->s_magic == SYSFS_MAGIC; ++#else ++ return 0; ++#endif ++} ++ ++static inline int au_test_configfs(struct super_block *sb __maybe_unused) ++{ ++#if defined(CONFIG_CONFIGFS_FS) || defined(CONFIG_CONFIGFS_FS_MODULE) ++ return sb->s_magic == CONFIGFS_MAGIC; ++#else ++ return 0; ++#endif ++} ++ ++static inline int au_test_minix(struct super_block *sb __maybe_unused) ++{ ++#if defined(CONFIG_MINIX_FS) || defined(CONFIG_MINIX_FS_MODULE) ++ return sb->s_magic == MINIX3_SUPER_MAGIC ++ || sb->s_magic == MINIX2_SUPER_MAGIC ++ || sb->s_magic == MINIX2_SUPER_MAGIC2 ++ || sb->s_magic == MINIX_SUPER_MAGIC ++ || sb->s_magic == MINIX_SUPER_MAGIC2; ++#else ++ return 0; ++#endif ++} ++ ++static inline int au_test_fat(struct super_block *sb __maybe_unused) ++{ ++#if defined(CONFIG_FAT_FS) || defined(CONFIG_FAT_FS_MODULE) ++ return sb->s_magic == MSDOS_SUPER_MAGIC; ++#else ++ return 0; ++#endif ++} ++ ++static inline int au_test_msdos(struct super_block *sb) ++{ ++ return au_test_fat(sb); ++} ++ ++static inline int au_test_vfat(struct super_block *sb) ++{ ++ return au_test_fat(sb); ++} ++ ++static inline int au_test_securityfs(struct super_block *sb __maybe_unused) ++{ ++#ifdef CONFIG_SECURITYFS ++ return sb->s_magic == SECURITYFS_MAGIC; ++#else ++ return 0; ++#endif ++} ++ ++static inline int au_test_squashfs(struct super_block *sb __maybe_unused) ++{ ++#if defined(CONFIG_SQUASHFS) || defined(CONFIG_SQUASHFS_MODULE) ++ return sb->s_magic == SQUASHFS_MAGIC; ++#else ++ return 0; ++#endif ++} ++ ++static inline int au_test_btrfs(struct super_block *sb __maybe_unused) ++{ ++#if defined(CONFIG_BTRFS_FS) || defined(CONFIG_BTRFS_FS_MODULE) ++ return sb->s_magic == BTRFS_SUPER_MAGIC; ++#else ++ return 0; ++#endif ++} ++ ++static inline int au_test_xenfs(struct super_block *sb __maybe_unused) ++{ ++#if defined(CONFIG_XENFS) || defined(CONFIG_XENFS_MODULE) ++ return sb->s_magic == XENFS_SUPER_MAGIC; ++#else ++ return 0; ++#endif ++} ++ ++static inline int au_test_debugfs(struct super_block *sb __maybe_unused) ++{ ++#ifdef CONFIG_DEBUG_FS ++ return sb->s_magic == DEBUGFS_MAGIC; ++#else ++ return 0; ++#endif ++} ++ ++static inline int au_test_nilfs(struct super_block *sb __maybe_unused) ++{ ++#if defined(CONFIG_NILFS) || defined(CONFIG_NILFS_MODULE) ++ return sb->s_magic == NILFS_SUPER_MAGIC; ++#else ++ return 0; ++#endif ++} ++ ++static inline int au_test_hfsplus(struct super_block *sb __maybe_unused) ++{ ++#if defined(CONFIG_HFSPLUS_FS) || defined(CONFIG_HFSPLUS_FS_MODULE) ++ return sb->s_magic == HFSPLUS_SUPER_MAGIC; ++#else ++ return 0; ++#endif ++} ++ ++/* ---------------------------------------------------------------------- */ ++/* ++ * they can't be an aufs branch. ++ */ ++static inline int au_test_fs_unsuppoted(struct super_block *sb) ++{ ++ return ++#ifndef CONFIG_AUFS_BR_RAMFS ++ au_test_ramfs(sb) || ++#endif ++ au_test_procfs(sb) ++ || au_test_sysfs(sb) ++ || au_test_configfs(sb) ++ || au_test_debugfs(sb) ++ || au_test_securityfs(sb) ++ || au_test_xenfs(sb) ++ || au_test_ecryptfs(sb) ++ /* || !strcmp(au_sbtype(sb), "unionfs") */ ++ || au_test_aufs(sb); /* will be supported in next version */ ++} ++ ++static inline int au_test_fs_remote(struct super_block *sb) ++{ ++ return !au_test_tmpfs(sb) ++#ifdef CONFIG_AUFS_BR_RAMFS ++ && !au_test_ramfs(sb) ++#endif ++ && !(sb->s_type->fs_flags & FS_REQUIRES_DEV); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * Note: these functions (below) are created after reading ->getattr() in all ++ * filesystems under linux/fs. it means we have to do so in every update... ++ */ ++ ++/* ++ * some filesystems require getattr to refresh the inode attributes before ++ * referencing. ++ * in most cases, we can rely on the inode attribute in NFS (or every remote fs) ++ * and leave the work for d_revalidate() ++ */ ++static inline int au_test_fs_refresh_iattr(struct super_block *sb) ++{ ++ return au_test_nfs(sb) ++ || au_test_fuse(sb) ++ /* || au_test_btrfs(sb) */ /* untested */ ++ ; ++} ++ ++/* ++ * filesystems which don't maintain i_size or i_blocks. ++ */ ++static inline int au_test_fs_bad_iattr_size(struct super_block *sb) ++{ ++ return au_test_xfs(sb) ++ || au_test_btrfs(sb) ++ || au_test_ubifs(sb) ++ || au_test_hfsplus(sb) /* maintained, but incorrect */ ++ /* || au_test_minix(sb) */ /* untested */ ++ ; ++} ++ ++/* ++ * filesystems which don't store the correct value in some of their inode ++ * attributes. ++ */ ++static inline int au_test_fs_bad_iattr(struct super_block *sb) ++{ ++ return au_test_fs_bad_iattr_size(sb) ++ || au_test_fat(sb) ++ || au_test_msdos(sb) ++ || au_test_vfat(sb); ++} ++ ++/* they don't check i_nlink in link(2) */ ++static inline int au_test_fs_no_limit_nlink(struct super_block *sb) ++{ ++ return au_test_tmpfs(sb) ++#ifdef CONFIG_AUFS_BR_RAMFS ++ || au_test_ramfs(sb) ++#endif ++ || au_test_ubifs(sb) ++ || au_test_hfsplus(sb); ++} ++ ++/* ++ * filesystems which sets S_NOATIME and S_NOCMTIME. ++ */ ++static inline int au_test_fs_notime(struct super_block *sb) ++{ ++ return au_test_nfs(sb) ++ || au_test_fuse(sb) ++ || au_test_ubifs(sb) ++ ; ++} ++ ++/* temporary support for i#1 in cramfs */ ++static inline int au_test_fs_unique_ino(struct inode *inode) ++{ ++ if (au_test_cramfs(inode->i_sb)) ++ return inode->i_ino != 1; ++ return 1; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * the filesystem where the xino files placed must support i/o after unlink and ++ * maintain i_size and i_blocks. ++ */ ++static inline int au_test_fs_bad_xino(struct super_block *sb) ++{ ++ return au_test_fs_remote(sb) ++ || au_test_fs_bad_iattr_size(sb) ++ /* don't want unnecessary work for xino */ ++ || au_test_aufs(sb) ++ || au_test_ecryptfs(sb) ++ || au_test_nilfs(sb); ++} ++ ++static inline int au_test_fs_trunc_xino(struct super_block *sb) ++{ ++ return au_test_tmpfs(sb) ++ || au_test_ramfs(sb); ++} ++ ++/* ++ * test if the @sb is real-readonly. ++ */ ++static inline int au_test_fs_rr(struct super_block *sb) ++{ ++ return au_test_squashfs(sb) ++ || au_test_iso9660(sb) ++ || au_test_cramfs(sb) ++ || au_test_romfs(sb); ++} ++ ++/* ++ * test if the @inode is nfs with 'noacl' option ++ * NFS always sets MS_POSIXACL regardless its mount option 'noacl.' ++ */ ++static inline int au_test_nfs_noacl(struct inode *inode) ++{ ++ return au_test_nfs(inode->i_sb) ++ /* && IS_POSIXACL(inode) */ ++ && !nfs_server_capable(inode, NFS_CAP_ACLS); ++} ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_FSTYPE_H__ */ +diff --git a/fs/aufs/hfsnotify.c b/fs/aufs/hfsnotify.c +new file mode 100644 +index 0000000..501b551 +--- /dev/null ++++ b/fs/aufs/hfsnotify.c +@@ -0,0 +1,287 @@ ++/* ++ * Copyright (C) 2005-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * fsnotify for the lower directories ++ */ ++ ++#include "aufs.h" ++ ++/* FS_IN_IGNORED is unnecessary */ ++static const __u32 AuHfsnMask = (FS_MOVED_TO | FS_MOVED_FROM | FS_DELETE ++ | FS_CREATE | FS_EVENT_ON_CHILD); ++static DECLARE_WAIT_QUEUE_HEAD(au_hfsn_wq); ++static __cacheline_aligned_in_smp atomic64_t au_hfsn_ifree = ATOMIC64_INIT(0); ++ ++static void au_hfsn_free_mark(struct fsnotify_mark *mark) ++{ ++ struct au_hnotify *hn = container_of(mark, struct au_hnotify, ++ hn_mark); ++ AuDbg("here\n"); ++ au_cache_free_hnotify(hn); ++ smp_mb__before_atomic(); ++ if (atomic64_dec_and_test(&au_hfsn_ifree)) ++ wake_up(&au_hfsn_wq); ++} ++ ++static int au_hfsn_alloc(struct au_hinode *hinode) ++{ ++ int err; ++ struct au_hnotify *hn; ++ struct super_block *sb; ++ struct au_branch *br; ++ struct fsnotify_mark *mark; ++ aufs_bindex_t bindex; ++ ++ hn = hinode->hi_notify; ++ sb = hn->hn_aufs_inode->i_sb; ++ bindex = au_br_index(sb, hinode->hi_id); ++ br = au_sbr(sb, bindex); ++ AuDebugOn(!br->br_hfsn); ++ ++ mark = &hn->hn_mark; ++ fsnotify_init_mark(mark, au_hfsn_free_mark); ++ mark->mask = AuHfsnMask; ++ /* ++ * by udba rename or rmdir, aufs assign a new inode to the known ++ * h_inode, so specify 1 to allow dups. ++ */ ++ lockdep_off(); ++ err = fsnotify_add_mark(mark, br->br_hfsn->hfsn_group, hinode->hi_inode, ++ /*mnt*/NULL, /*allow_dups*/1); ++ lockdep_on(); ++ ++ return err; ++} ++ ++static int au_hfsn_free(struct au_hinode *hinode, struct au_hnotify *hn) ++{ ++ struct fsnotify_mark *mark; ++ unsigned long long ull; ++ struct fsnotify_group *group; ++ ++ ull = atomic64_inc_return(&au_hfsn_ifree); ++ BUG_ON(!ull); ++ ++ mark = &hn->hn_mark; ++ spin_lock(&mark->lock); ++ group = mark->group; ++ fsnotify_get_group(group); ++ spin_unlock(&mark->lock); ++ lockdep_off(); ++ fsnotify_destroy_mark(mark, group); ++ fsnotify_put_mark(mark); ++ fsnotify_put_group(group); ++ lockdep_on(); ++ ++ /* free hn by myself */ ++ return 0; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static void au_hfsn_ctl(struct au_hinode *hinode, int do_set) ++{ ++ struct fsnotify_mark *mark; ++ ++ mark = &hinode->hi_notify->hn_mark; ++ spin_lock(&mark->lock); ++ if (do_set) { ++ AuDebugOn(mark->mask & AuHfsnMask); ++ mark->mask |= AuHfsnMask; ++ } else { ++ AuDebugOn(!(mark->mask & AuHfsnMask)); ++ mark->mask &= ~AuHfsnMask; ++ } ++ spin_unlock(&mark->lock); ++ /* fsnotify_recalc_inode_mask(hinode->hi_inode); */ ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* #define AuDbgHnotify */ ++#ifdef AuDbgHnotify ++static char *au_hfsn_name(u32 mask) ++{ ++#ifdef CONFIG_AUFS_DEBUG ++#define test_ret(flag) \ ++ do { \ ++ if (mask & flag) \ ++ return #flag; \ ++ } while (0) ++ test_ret(FS_ACCESS); ++ test_ret(FS_MODIFY); ++ test_ret(FS_ATTRIB); ++ test_ret(FS_CLOSE_WRITE); ++ test_ret(FS_CLOSE_NOWRITE); ++ test_ret(FS_OPEN); ++ test_ret(FS_MOVED_FROM); ++ test_ret(FS_MOVED_TO); ++ test_ret(FS_CREATE); ++ test_ret(FS_DELETE); ++ test_ret(FS_DELETE_SELF); ++ test_ret(FS_MOVE_SELF); ++ test_ret(FS_UNMOUNT); ++ test_ret(FS_Q_OVERFLOW); ++ test_ret(FS_IN_IGNORED); ++ test_ret(FS_ISDIR); ++ test_ret(FS_IN_ONESHOT); ++ test_ret(FS_EVENT_ON_CHILD); ++ return ""; ++#undef test_ret ++#else ++ return "??"; ++#endif ++} ++#endif ++ ++/* ---------------------------------------------------------------------- */ ++ ++static void au_hfsn_free_group(struct fsnotify_group *group) ++{ ++ struct au_br_hfsnotify *hfsn = group->private; ++ ++ AuDbg("here\n"); ++ kfree(hfsn); ++} ++ ++static int au_hfsn_handle_event(struct fsnotify_group *group, ++ struct inode *inode, ++ struct fsnotify_mark *inode_mark, ++ struct fsnotify_mark *vfsmount_mark, ++ u32 mask, void *data, int data_type, ++ const unsigned char *file_name, u32 cookie) ++{ ++ int err; ++ struct au_hnotify *hnotify; ++ struct inode *h_dir, *h_inode; ++ struct qstr h_child_qstr = QSTR_INIT(file_name, strlen(file_name)); ++ ++ AuDebugOn(data_type != FSNOTIFY_EVENT_INODE); ++ ++ err = 0; ++ /* if FS_UNMOUNT happens, there must be another bug */ ++ AuDebugOn(mask & FS_UNMOUNT); ++ if (mask & (FS_IN_IGNORED | FS_UNMOUNT)) ++ goto out; ++ ++ h_dir = inode; ++ h_inode = NULL; ++#ifdef AuDbgHnotify ++ au_debug_on(); ++ if (1 || h_child_qstr.len != sizeof(AUFS_XINO_FNAME) - 1 ++ || strncmp(h_child_qstr.name, AUFS_XINO_FNAME, h_child_qstr.len)) { ++ AuDbg("i%lu, mask 0x%x %s, hcname %.*s, hi%lu\n", ++ h_dir->i_ino, mask, au_hfsn_name(mask), ++ AuLNPair(&h_child_qstr), h_inode ? h_inode->i_ino : 0); ++ /* WARN_ON(1); */ ++ } ++ au_debug_off(); ++#endif ++ ++ AuDebugOn(!inode_mark); ++ hnotify = container_of(inode_mark, struct au_hnotify, hn_mark); ++ err = au_hnotify(h_dir, hnotify, mask, &h_child_qstr, h_inode); ++ ++out: ++ return err; ++} ++ ++static struct fsnotify_ops au_hfsn_ops = { ++ .handle_event = au_hfsn_handle_event, ++ .free_group_priv = au_hfsn_free_group ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++static void au_hfsn_fin_br(struct au_branch *br) ++{ ++ struct au_br_hfsnotify *hfsn; ++ ++ hfsn = br->br_hfsn; ++ if (hfsn) { ++ lockdep_off(); ++ fsnotify_put_group(hfsn->hfsn_group); ++ lockdep_on(); ++ } ++} ++ ++static int au_hfsn_init_br(struct au_branch *br, int perm) ++{ ++ int err; ++ struct fsnotify_group *group; ++ struct au_br_hfsnotify *hfsn; ++ ++ err = 0; ++ br->br_hfsn = NULL; ++ if (!au_br_hnotifyable(perm)) ++ goto out; ++ ++ err = -ENOMEM; ++ hfsn = kmalloc(sizeof(*hfsn), GFP_NOFS); ++ if (unlikely(!hfsn)) ++ goto out; ++ ++ err = 0; ++ group = fsnotify_alloc_group(&au_hfsn_ops); ++ if (IS_ERR(group)) { ++ err = PTR_ERR(group); ++ pr_err("fsnotify_alloc_group() failed, %d\n", err); ++ goto out_hfsn; ++ } ++ ++ group->private = hfsn; ++ hfsn->hfsn_group = group; ++ br->br_hfsn = hfsn; ++ goto out; /* success */ ++ ++out_hfsn: ++ kfree(hfsn); ++out: ++ return err; ++} ++ ++static int au_hfsn_reset_br(unsigned int udba, struct au_branch *br, int perm) ++{ ++ int err; ++ ++ err = 0; ++ if (!br->br_hfsn) ++ err = au_hfsn_init_br(br, perm); ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static void au_hfsn_fin(void) ++{ ++ AuDbg("au_hfsn_ifree %lld\n", (long long)atomic64_read(&au_hfsn_ifree)); ++ wait_event(au_hfsn_wq, !atomic64_read(&au_hfsn_ifree)); ++} ++ ++const struct au_hnotify_op au_hnotify_op = { ++ .ctl = au_hfsn_ctl, ++ .alloc = au_hfsn_alloc, ++ .free = au_hfsn_free, ++ ++ .fin = au_hfsn_fin, ++ ++ .reset_br = au_hfsn_reset_br, ++ .fin_br = au_hfsn_fin_br, ++ .init_br = au_hfsn_init_br ++}; +diff --git a/fs/aufs/hfsplus.c b/fs/aufs/hfsplus.c +new file mode 100644 +index 0000000..af256fa +--- /dev/null ++++ b/fs/aufs/hfsplus.c +@@ -0,0 +1,56 @@ ++/* ++ * Copyright (C) 2010-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * special support for filesystems which aqucires an inode mutex ++ * at final closing a file, eg, hfsplus. ++ * ++ * This trick is very simple and stupid, just to open the file before really ++ * neceeary open to tell hfsplus that this is not the final closing. ++ * The caller should call au_h_open_pre() after acquiring the inode mutex, ++ * and au_h_open_post() after releasing it. ++ */ ++ ++#include "aufs.h" ++ ++struct file *au_h_open_pre(struct dentry *dentry, aufs_bindex_t bindex, ++ int force_wr) ++{ ++ struct file *h_file; ++ struct dentry *h_dentry; ++ ++ h_dentry = au_h_dptr(dentry, bindex); ++ AuDebugOn(!h_dentry); ++ AuDebugOn(d_is_negative(h_dentry)); ++ ++ h_file = NULL; ++ if (au_test_hfsplus(h_dentry->d_sb) ++ && d_is_reg(h_dentry)) ++ h_file = au_h_open(dentry, bindex, ++ O_RDONLY | O_NOATIME | O_LARGEFILE, ++ /*file*/NULL, force_wr); ++ return h_file; ++} ++ ++void au_h_open_post(struct dentry *dentry, aufs_bindex_t bindex, ++ struct file *h_file) ++{ ++ if (h_file) { ++ fput(h_file); ++ au_sbr_put(dentry->d_sb, bindex); ++ } ++} +diff --git a/fs/aufs/hnotify.c b/fs/aufs/hnotify.c +new file mode 100644 +index 0000000..d948ed5 +--- /dev/null ++++ b/fs/aufs/hnotify.c +@@ -0,0 +1,710 @@ ++/* ++ * Copyright (C) 2005-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * abstraction to notify the direct changes on lower directories ++ */ ++ ++#include "aufs.h" ++ ++int au_hn_alloc(struct au_hinode *hinode, struct inode *inode) ++{ ++ int err; ++ struct au_hnotify *hn; ++ ++ err = -ENOMEM; ++ hn = au_cache_alloc_hnotify(); ++ if (hn) { ++ hn->hn_aufs_inode = inode; ++ hinode->hi_notify = hn; ++ err = au_hnotify_op.alloc(hinode); ++ AuTraceErr(err); ++ if (unlikely(err)) { ++ hinode->hi_notify = NULL; ++ au_cache_free_hnotify(hn); ++ /* ++ * The upper dir was removed by udba, but the same named ++ * dir left. In this case, aufs assignes a new inode ++ * number and set the monitor again. ++ * For the lower dir, the old monitnor is still left. ++ */ ++ if (err == -EEXIST) ++ err = 0; ++ } ++ } ++ ++ AuTraceErr(err); ++ return err; ++} ++ ++void au_hn_free(struct au_hinode *hinode) ++{ ++ struct au_hnotify *hn; ++ ++ hn = hinode->hi_notify; ++ if (hn) { ++ hinode->hi_notify = NULL; ++ if (au_hnotify_op.free(hinode, hn)) ++ au_cache_free_hnotify(hn); ++ } ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++void au_hn_ctl(struct au_hinode *hinode, int do_set) ++{ ++ if (hinode->hi_notify) ++ au_hnotify_op.ctl(hinode, do_set); ++} ++ ++void au_hn_reset(struct inode *inode, unsigned int flags) ++{ ++ aufs_bindex_t bindex, bbot; ++ struct inode *hi; ++ struct dentry *iwhdentry; ++ ++ bbot = au_ibbot(inode); ++ for (bindex = au_ibtop(inode); bindex <= bbot; bindex++) { ++ hi = au_h_iptr(inode, bindex); ++ if (!hi) ++ continue; ++ ++ /* inode_lock_nested(hi, AuLsc_I_CHILD); */ ++ iwhdentry = au_hi_wh(inode, bindex); ++ if (iwhdentry) ++ dget(iwhdentry); ++ au_igrab(hi); ++ au_set_h_iptr(inode, bindex, NULL, 0); ++ au_set_h_iptr(inode, bindex, au_igrab(hi), ++ flags & ~AuHi_XINO); ++ iput(hi); ++ dput(iwhdentry); ++ /* inode_unlock(hi); */ ++ } ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int hn_xino(struct inode *inode, struct inode *h_inode) ++{ ++ int err; ++ aufs_bindex_t bindex, bbot, bfound, btop; ++ struct inode *h_i; ++ ++ err = 0; ++ if (unlikely(inode->i_ino == AUFS_ROOT_INO)) { ++ pr_warn("branch root dir was changed\n"); ++ goto out; ++ } ++ ++ bfound = -1; ++ bbot = au_ibbot(inode); ++ btop = au_ibtop(inode); ++#if 0 /* reserved for future use */ ++ if (bindex == bbot) { ++ /* keep this ino in rename case */ ++ goto out; ++ } ++#endif ++ for (bindex = btop; bindex <= bbot; bindex++) ++ if (au_h_iptr(inode, bindex) == h_inode) { ++ bfound = bindex; ++ break; ++ } ++ if (bfound < 0) ++ goto out; ++ ++ for (bindex = btop; bindex <= bbot; bindex++) { ++ h_i = au_h_iptr(inode, bindex); ++ if (!h_i) ++ continue; ++ ++ err = au_xino_write(inode->i_sb, bindex, h_i->i_ino, /*ino*/0); ++ /* ignore this error */ ++ /* bad action? */ ++ } ++ ++ /* children inode number will be broken */ ++ ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++static int hn_gen_tree(struct dentry *dentry) ++{ ++ int err, i, j, ndentry; ++ struct au_dcsub_pages dpages; ++ struct au_dpage *dpage; ++ struct dentry **dentries; ++ ++ err = au_dpages_init(&dpages, GFP_NOFS); ++ if (unlikely(err)) ++ goto out; ++ err = au_dcsub_pages(&dpages, dentry, NULL, NULL); ++ if (unlikely(err)) ++ goto out_dpages; ++ ++ for (i = 0; i < dpages.ndpage; i++) { ++ dpage = dpages.dpages + i; ++ dentries = dpage->dentries; ++ ndentry = dpage->ndentry; ++ for (j = 0; j < ndentry; j++) { ++ struct dentry *d; ++ ++ d = dentries[j]; ++ if (IS_ROOT(d)) ++ continue; ++ ++ au_digen_dec(d); ++ if (d_really_is_positive(d)) ++ /* todo: reset children xino? ++ cached children only? */ ++ au_iigen_dec(d_inode(d)); ++ } ++ } ++ ++out_dpages: ++ au_dpages_free(&dpages); ++ ++#if 0 ++ /* discard children */ ++ dentry_unhash(dentry); ++ dput(dentry); ++#endif ++out: ++ return err; ++} ++ ++/* ++ * return 0 if processed. ++ */ ++static int hn_gen_by_inode(char *name, unsigned int nlen, struct inode *inode, ++ const unsigned int isdir) ++{ ++ int err; ++ struct dentry *d; ++ struct qstr *dname; ++ ++ err = 1; ++ if (unlikely(inode->i_ino == AUFS_ROOT_INO)) { ++ pr_warn("branch root dir was changed\n"); ++ err = 0; ++ goto out; ++ } ++ ++ if (!isdir) { ++ AuDebugOn(!name); ++ au_iigen_dec(inode); ++ spin_lock(&inode->i_lock); ++ hlist_for_each_entry(d, &inode->i_dentry, d_u.d_alias) { ++ spin_lock(&d->d_lock); ++ dname = &d->d_name; ++ if (dname->len != nlen ++ && memcmp(dname->name, name, nlen)) { ++ spin_unlock(&d->d_lock); ++ continue; ++ } ++ err = 0; ++ au_digen_dec(d); ++ spin_unlock(&d->d_lock); ++ break; ++ } ++ spin_unlock(&inode->i_lock); ++ } else { ++ au_fset_si(au_sbi(inode->i_sb), FAILED_REFRESH_DIR); ++ d = d_find_any_alias(inode); ++ if (!d) { ++ au_iigen_dec(inode); ++ goto out; ++ } ++ ++ spin_lock(&d->d_lock); ++ dname = &d->d_name; ++ if (dname->len == nlen && !memcmp(dname->name, name, nlen)) { ++ spin_unlock(&d->d_lock); ++ err = hn_gen_tree(d); ++ spin_lock(&d->d_lock); ++ } ++ spin_unlock(&d->d_lock); ++ dput(d); ++ } ++ ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++static int hn_gen_by_name(struct dentry *dentry, const unsigned int isdir) ++{ ++ int err; ++ ++ if (IS_ROOT(dentry)) { ++ pr_warn("branch root dir was changed\n"); ++ return 0; ++ } ++ ++ err = 0; ++ if (!isdir) { ++ au_digen_dec(dentry); ++ if (d_really_is_positive(dentry)) ++ au_iigen_dec(d_inode(dentry)); ++ } else { ++ au_fset_si(au_sbi(dentry->d_sb), FAILED_REFRESH_DIR); ++ if (d_really_is_positive(dentry)) ++ err = hn_gen_tree(dentry); ++ } ++ ++ AuTraceErr(err); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* hnotify job flags */ ++#define AuHnJob_XINO0 1 ++#define AuHnJob_GEN (1 << 1) ++#define AuHnJob_DIRENT (1 << 2) ++#define AuHnJob_ISDIR (1 << 3) ++#define AuHnJob_TRYXINO0 (1 << 4) ++#define AuHnJob_MNTPNT (1 << 5) ++#define au_ftest_hnjob(flags, name) ((flags) & AuHnJob_##name) ++#define au_fset_hnjob(flags, name) \ ++ do { (flags) |= AuHnJob_##name; } while (0) ++#define au_fclr_hnjob(flags, name) \ ++ do { (flags) &= ~AuHnJob_##name; } while (0) ++ ++enum { ++ AuHn_CHILD, ++ AuHn_PARENT, ++ AuHnLast ++}; ++ ++struct au_hnotify_args { ++ struct inode *h_dir, *dir, *h_child_inode; ++ u32 mask; ++ unsigned int flags[AuHnLast]; ++ unsigned int h_child_nlen; ++ char h_child_name[]; ++}; ++ ++struct hn_job_args { ++ unsigned int flags; ++ struct inode *inode, *h_inode, *dir, *h_dir; ++ struct dentry *dentry; ++ char *h_name; ++ int h_nlen; ++}; ++ ++static int hn_job(struct hn_job_args *a) ++{ ++ const unsigned int isdir = au_ftest_hnjob(a->flags, ISDIR); ++ int e; ++ ++ /* reset xino */ ++ if (au_ftest_hnjob(a->flags, XINO0) && a->inode) ++ hn_xino(a->inode, a->h_inode); /* ignore this error */ ++ ++ if (au_ftest_hnjob(a->flags, TRYXINO0) ++ && a->inode ++ && a->h_inode) { ++ inode_lock_nested(a->h_inode, AuLsc_I_CHILD); ++ if (!a->h_inode->i_nlink ++ && !(a->h_inode->i_state & I_LINKABLE)) ++ hn_xino(a->inode, a->h_inode); /* ignore this error */ ++ inode_unlock(a->h_inode); ++ } ++ ++ /* make the generation obsolete */ ++ if (au_ftest_hnjob(a->flags, GEN)) { ++ e = -1; ++ if (a->inode) ++ e = hn_gen_by_inode(a->h_name, a->h_nlen, a->inode, ++ isdir); ++ if (e && a->dentry) ++ hn_gen_by_name(a->dentry, isdir); ++ /* ignore this error */ ++ } ++ ++ /* make dir entries obsolete */ ++ if (au_ftest_hnjob(a->flags, DIRENT) && a->inode) { ++ struct au_vdir *vdir; ++ ++ vdir = au_ivdir(a->inode); ++ if (vdir) ++ vdir->vd_jiffy = 0; ++ /* IMustLock(a->inode); */ ++ /* a->inode->i_version++; */ ++ } ++ ++ /* can do nothing but warn */ ++ if (au_ftest_hnjob(a->flags, MNTPNT) ++ && a->dentry ++ && d_mountpoint(a->dentry)) ++ pr_warn("mount-point %pd is removed or renamed\n", a->dentry); ++ ++ return 0; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static struct dentry *lookup_wlock_by_name(char *name, unsigned int nlen, ++ struct inode *dir) ++{ ++ struct dentry *dentry, *d, *parent; ++ struct qstr *dname; ++ ++ parent = d_find_any_alias(dir); ++ if (!parent) ++ return NULL; ++ ++ dentry = NULL; ++ spin_lock(&parent->d_lock); ++ list_for_each_entry(d, &parent->d_subdirs, d_child) { ++ /* AuDbg("%pd\n", d); */ ++ spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); ++ dname = &d->d_name; ++ if (dname->len != nlen || memcmp(dname->name, name, nlen)) ++ goto cont_unlock; ++ if (au_di(d)) ++ au_digen_dec(d); ++ else ++ goto cont_unlock; ++ if (au_dcount(d) > 0) { ++ dentry = dget_dlock(d); ++ spin_unlock(&d->d_lock); ++ break; ++ } ++ ++cont_unlock: ++ spin_unlock(&d->d_lock); ++ } ++ spin_unlock(&parent->d_lock); ++ dput(parent); ++ ++ if (dentry) ++ di_write_lock_child(dentry); ++ ++ return dentry; ++} ++ ++static struct inode *lookup_wlock_by_ino(struct super_block *sb, ++ aufs_bindex_t bindex, ino_t h_ino) ++{ ++ struct inode *inode; ++ ino_t ino; ++ int err; ++ ++ inode = NULL; ++ err = au_xino_read(sb, bindex, h_ino, &ino); ++ if (!err && ino) ++ inode = ilookup(sb, ino); ++ if (!inode) ++ goto out; ++ ++ if (unlikely(inode->i_ino == AUFS_ROOT_INO)) { ++ pr_warn("wrong root branch\n"); ++ iput(inode); ++ inode = NULL; ++ goto out; ++ } ++ ++ ii_write_lock_child(inode); ++ ++out: ++ return inode; ++} ++ ++static void au_hn_bh(void *_args) ++{ ++ struct au_hnotify_args *a = _args; ++ struct super_block *sb; ++ aufs_bindex_t bindex, bbot, bfound; ++ unsigned char xino, try_iput; ++ int err; ++ struct inode *inode; ++ ino_t h_ino; ++ struct hn_job_args args; ++ struct dentry *dentry; ++ struct au_sbinfo *sbinfo; ++ ++ AuDebugOn(!_args); ++ AuDebugOn(!a->h_dir); ++ AuDebugOn(!a->dir); ++ AuDebugOn(!a->mask); ++ AuDbg("mask 0x%x, i%lu, hi%lu, hci%lu\n", ++ a->mask, a->dir->i_ino, a->h_dir->i_ino, ++ a->h_child_inode ? a->h_child_inode->i_ino : 0); ++ ++ inode = NULL; ++ dentry = NULL; ++ /* ++ * do not lock a->dir->i_mutex here ++ * because of d_revalidate() may cause a deadlock. ++ */ ++ sb = a->dir->i_sb; ++ AuDebugOn(!sb); ++ sbinfo = au_sbi(sb); ++ AuDebugOn(!sbinfo); ++ si_write_lock(sb, AuLock_NOPLMW); ++ ++ ii_read_lock_parent(a->dir); ++ bfound = -1; ++ bbot = au_ibbot(a->dir); ++ for (bindex = au_ibtop(a->dir); bindex <= bbot; bindex++) ++ if (au_h_iptr(a->dir, bindex) == a->h_dir) { ++ bfound = bindex; ++ break; ++ } ++ ii_read_unlock(a->dir); ++ if (unlikely(bfound < 0)) ++ goto out; ++ ++ xino = !!au_opt_test(au_mntflags(sb), XINO); ++ h_ino = 0; ++ if (a->h_child_inode) ++ h_ino = a->h_child_inode->i_ino; ++ ++ if (a->h_child_nlen ++ && (au_ftest_hnjob(a->flags[AuHn_CHILD], GEN) ++ || au_ftest_hnjob(a->flags[AuHn_CHILD], MNTPNT))) ++ dentry = lookup_wlock_by_name(a->h_child_name, a->h_child_nlen, ++ a->dir); ++ try_iput = 0; ++ if (dentry && d_really_is_positive(dentry)) ++ inode = d_inode(dentry); ++ if (xino && !inode && h_ino ++ && (au_ftest_hnjob(a->flags[AuHn_CHILD], XINO0) ++ || au_ftest_hnjob(a->flags[AuHn_CHILD], TRYXINO0) ++ || au_ftest_hnjob(a->flags[AuHn_CHILD], GEN))) { ++ inode = lookup_wlock_by_ino(sb, bfound, h_ino); ++ try_iput = 1; ++ } ++ ++ args.flags = a->flags[AuHn_CHILD]; ++ args.dentry = dentry; ++ args.inode = inode; ++ args.h_inode = a->h_child_inode; ++ args.dir = a->dir; ++ args.h_dir = a->h_dir; ++ args.h_name = a->h_child_name; ++ args.h_nlen = a->h_child_nlen; ++ err = hn_job(&args); ++ if (dentry) { ++ if (au_di(dentry)) ++ di_write_unlock(dentry); ++ dput(dentry); ++ } ++ if (inode && try_iput) { ++ ii_write_unlock(inode); ++ iput(inode); ++ } ++ ++ ii_write_lock_parent(a->dir); ++ args.flags = a->flags[AuHn_PARENT]; ++ args.dentry = NULL; ++ args.inode = a->dir; ++ args.h_inode = a->h_dir; ++ args.dir = NULL; ++ args.h_dir = NULL; ++ args.h_name = NULL; ++ args.h_nlen = 0; ++ err = hn_job(&args); ++ ii_write_unlock(a->dir); ++ ++out: ++ iput(a->h_child_inode); ++ iput(a->h_dir); ++ iput(a->dir); ++ si_write_unlock(sb); ++ au_nwt_done(&sbinfo->si_nowait); ++ kfree(a); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++int au_hnotify(struct inode *h_dir, struct au_hnotify *hnotify, u32 mask, ++ struct qstr *h_child_qstr, struct inode *h_child_inode) ++{ ++ int err, len; ++ unsigned int flags[AuHnLast], f; ++ unsigned char isdir, isroot, wh; ++ struct inode *dir; ++ struct au_hnotify_args *args; ++ char *p, *h_child_name; ++ ++ err = 0; ++ AuDebugOn(!hnotify || !hnotify->hn_aufs_inode); ++ dir = igrab(hnotify->hn_aufs_inode); ++ if (!dir) ++ goto out; ++ ++ isroot = (dir->i_ino == AUFS_ROOT_INO); ++ wh = 0; ++ h_child_name = (void *)h_child_qstr->name; ++ len = h_child_qstr->len; ++ if (h_child_name) { ++ if (len > AUFS_WH_PFX_LEN ++ && !memcmp(h_child_name, AUFS_WH_PFX, AUFS_WH_PFX_LEN)) { ++ h_child_name += AUFS_WH_PFX_LEN; ++ len -= AUFS_WH_PFX_LEN; ++ wh = 1; ++ } ++ } ++ ++ isdir = 0; ++ if (h_child_inode) ++ isdir = !!S_ISDIR(h_child_inode->i_mode); ++ flags[AuHn_PARENT] = AuHnJob_ISDIR; ++ flags[AuHn_CHILD] = 0; ++ if (isdir) ++ flags[AuHn_CHILD] = AuHnJob_ISDIR; ++ au_fset_hnjob(flags[AuHn_PARENT], DIRENT); ++ au_fset_hnjob(flags[AuHn_CHILD], GEN); ++ switch (mask & FS_EVENTS_POSS_ON_CHILD) { ++ case FS_MOVED_FROM: ++ case FS_MOVED_TO: ++ au_fset_hnjob(flags[AuHn_CHILD], XINO0); ++ au_fset_hnjob(flags[AuHn_CHILD], MNTPNT); ++ /*FALLTHROUGH*/ ++ case FS_CREATE: ++ AuDebugOn(!h_child_name); ++ break; ++ ++ case FS_DELETE: ++ /* ++ * aufs never be able to get this child inode. ++ * revalidation should be in d_revalidate() ++ * by checking i_nlink, i_generation or d_unhashed(). ++ */ ++ AuDebugOn(!h_child_name); ++ au_fset_hnjob(flags[AuHn_CHILD], TRYXINO0); ++ au_fset_hnjob(flags[AuHn_CHILD], MNTPNT); ++ break; ++ ++ default: ++ AuDebugOn(1); ++ } ++ ++ if (wh) ++ h_child_inode = NULL; ++ ++ err = -ENOMEM; ++ /* iput() and kfree() will be called in au_hnotify() */ ++ args = kmalloc(sizeof(*args) + len + 1, GFP_NOFS); ++ if (unlikely(!args)) { ++ AuErr1("no memory\n"); ++ iput(dir); ++ goto out; ++ } ++ args->flags[AuHn_PARENT] = flags[AuHn_PARENT]; ++ args->flags[AuHn_CHILD] = flags[AuHn_CHILD]; ++ args->mask = mask; ++ args->dir = dir; ++ args->h_dir = igrab(h_dir); ++ if (h_child_inode) ++ h_child_inode = igrab(h_child_inode); /* can be NULL */ ++ args->h_child_inode = h_child_inode; ++ args->h_child_nlen = len; ++ if (len) { ++ p = (void *)args; ++ p += sizeof(*args); ++ memcpy(p, h_child_name, len); ++ p[len] = 0; ++ } ++ ++ /* NFS fires the event for silly-renamed one from kworker */ ++ f = 0; ++ if (!dir->i_nlink ++ || (au_test_nfs(h_dir->i_sb) && (mask & FS_DELETE))) ++ f = AuWkq_NEST; ++ err = au_wkq_nowait(au_hn_bh, args, dir->i_sb, f); ++ if (unlikely(err)) { ++ pr_err("wkq %d\n", err); ++ iput(args->h_child_inode); ++ iput(args->h_dir); ++ iput(args->dir); ++ kfree(args); ++ } ++ ++out: ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++int au_hnotify_reset_br(unsigned int udba, struct au_branch *br, int perm) ++{ ++ int err; ++ ++ AuDebugOn(!(udba & AuOptMask_UDBA)); ++ ++ err = 0; ++ if (au_hnotify_op.reset_br) ++ err = au_hnotify_op.reset_br(udba, br, perm); ++ ++ return err; ++} ++ ++int au_hnotify_init_br(struct au_branch *br, int perm) ++{ ++ int err; ++ ++ err = 0; ++ if (au_hnotify_op.init_br) ++ err = au_hnotify_op.init_br(br, perm); ++ ++ return err; ++} ++ ++void au_hnotify_fin_br(struct au_branch *br) ++{ ++ if (au_hnotify_op.fin_br) ++ au_hnotify_op.fin_br(br); ++} ++ ++static void au_hn_destroy_cache(void) ++{ ++ kmem_cache_destroy(au_cachep[AuCache_HNOTIFY]); ++ au_cachep[AuCache_HNOTIFY] = NULL; ++} ++ ++int __init au_hnotify_init(void) ++{ ++ int err; ++ ++ err = -ENOMEM; ++ au_cachep[AuCache_HNOTIFY] = AuCache(au_hnotify); ++ if (au_cachep[AuCache_HNOTIFY]) { ++ err = 0; ++ if (au_hnotify_op.init) ++ err = au_hnotify_op.init(); ++ if (unlikely(err)) ++ au_hn_destroy_cache(); ++ } ++ AuTraceErr(err); ++ return err; ++} ++ ++void au_hnotify_fin(void) ++{ ++ if (au_hnotify_op.fin) ++ au_hnotify_op.fin(); ++ /* cf. au_cache_fin() */ ++ if (au_cachep[AuCache_HNOTIFY]) ++ au_hn_destroy_cache(); ++} +diff --git a/fs/aufs/i_op.c b/fs/aufs/i_op.c +new file mode 100644 +index 0000000..6f3fedb +--- /dev/null ++++ b/fs/aufs/i_op.c +@@ -0,0 +1,1407 @@ ++/* ++ * Copyright (C) 2005-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * inode operations (except add/del/rename) ++ */ ++ ++#include ++#include ++#include ++#include ++#include "aufs.h" ++ ++static int h_permission(struct inode *h_inode, int mask, ++ struct path *h_path, int brperm) ++{ ++ int err; ++ const unsigned char write_mask = !!(mask & (MAY_WRITE | MAY_APPEND)); ++ ++ err = -EACCES; ++ if ((write_mask && IS_IMMUTABLE(h_inode)) ++ || ((mask & MAY_EXEC) ++ && S_ISREG(h_inode->i_mode) ++ && (path_noexec(h_path) ++ || !(h_inode->i_mode & S_IXUGO)))) ++ goto out; ++ ++ /* ++ * - skip the lower fs test in the case of write to ro branch. ++ * - nfs dir permission write check is optimized, but a policy for ++ * link/rename requires a real check. ++ * - nfs always sets MS_POSIXACL regardless its mount option 'noacl.' ++ * in this case, generic_permission() returns -EOPNOTSUPP. ++ */ ++ if ((write_mask && !au_br_writable(brperm)) ++ || (au_test_nfs(h_inode->i_sb) && S_ISDIR(h_inode->i_mode) ++ && write_mask && !(mask & MAY_READ)) ++ || !h_inode->i_op->permission) { ++ /* AuLabel(generic_permission); */ ++ /* AuDbg("get_acl %pf\n", h_inode->i_op->get_acl); */ ++ err = generic_permission(h_inode, mask); ++ if (err == -EOPNOTSUPP && au_test_nfs_noacl(h_inode)) ++ err = h_inode->i_op->permission(h_inode, mask); ++ AuTraceErr(err); ++ } else { ++ /* AuLabel(h_inode->permission); */ ++ err = h_inode->i_op->permission(h_inode, mask); ++ AuTraceErr(err); ++ } ++ ++ if (!err) ++ err = devcgroup_inode_permission(h_inode, mask); ++ if (!err) ++ err = security_inode_permission(h_inode, mask); ++ ++#if 0 ++ if (!err) { ++ /* todo: do we need to call ima_path_check()? */ ++ struct path h_path = { ++ .dentry = ++ .mnt = h_mnt ++ }; ++ err = ima_path_check(&h_path, ++ mask & (MAY_READ | MAY_WRITE | MAY_EXEC), ++ IMA_COUNT_LEAVE); ++ } ++#endif ++ ++out: ++ return err; ++} ++ ++static int aufs_permission(struct inode *inode, int mask) ++{ ++ int err; ++ aufs_bindex_t bindex, bbot; ++ const unsigned char isdir = !!S_ISDIR(inode->i_mode), ++ write_mask = !!(mask & (MAY_WRITE | MAY_APPEND)); ++ struct inode *h_inode; ++ struct super_block *sb; ++ struct au_branch *br; ++ ++ /* todo: support rcu-walk? */ ++ if (mask & MAY_NOT_BLOCK) ++ return -ECHILD; ++ ++ sb = inode->i_sb; ++ si_read_lock(sb, AuLock_FLUSH); ++ ii_read_lock_child(inode); ++#if 0 ++ err = au_iigen_test(inode, au_sigen(sb)); ++ if (unlikely(err)) ++ goto out; ++#endif ++ ++ if (!isdir ++ || write_mask ++ || au_opt_test(au_mntflags(sb), DIRPERM1)) { ++ err = au_busy_or_stale(); ++ h_inode = au_h_iptr(inode, au_ibtop(inode)); ++ if (unlikely(!h_inode ++ || (h_inode->i_mode & S_IFMT) ++ != (inode->i_mode & S_IFMT))) ++ goto out; ++ ++ err = 0; ++ bindex = au_ibtop(inode); ++ br = au_sbr(sb, bindex); ++ err = h_permission(h_inode, mask, &br->br_path, br->br_perm); ++ if (write_mask ++ && !err ++ && !special_file(h_inode->i_mode)) { ++ /* test whether the upper writable branch exists */ ++ err = -EROFS; ++ for (; bindex >= 0; bindex--) ++ if (!au_br_rdonly(au_sbr(sb, bindex))) { ++ err = 0; ++ break; ++ } ++ } ++ goto out; ++ } ++ ++ /* non-write to dir */ ++ err = 0; ++ bbot = au_ibbot(inode); ++ for (bindex = au_ibtop(inode); !err && bindex <= bbot; bindex++) { ++ h_inode = au_h_iptr(inode, bindex); ++ if (h_inode) { ++ err = au_busy_or_stale(); ++ if (unlikely(!S_ISDIR(h_inode->i_mode))) ++ break; ++ ++ br = au_sbr(sb, bindex); ++ err = h_permission(h_inode, mask, &br->br_path, ++ br->br_perm); ++ } ++ } ++ ++out: ++ ii_read_unlock(inode); ++ si_read_unlock(sb); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static struct dentry *aufs_lookup(struct inode *dir, struct dentry *dentry, ++ unsigned int flags) ++{ ++ struct dentry *ret, *parent; ++ struct inode *inode; ++ struct super_block *sb; ++ int err, npositive; ++ ++ IMustLock(dir); ++ ++ /* todo: support rcu-walk? */ ++ ret = ERR_PTR(-ECHILD); ++ if (flags & LOOKUP_RCU) ++ goto out; ++ ++ ret = ERR_PTR(-ENAMETOOLONG); ++ if (unlikely(dentry->d_name.len > AUFS_MAX_NAMELEN)) ++ goto out; ++ ++ sb = dir->i_sb; ++ err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM); ++ ret = ERR_PTR(err); ++ if (unlikely(err)) ++ goto out; ++ ++ err = au_di_init(dentry); ++ ret = ERR_PTR(err); ++ if (unlikely(err)) ++ goto out_si; ++ ++ inode = NULL; ++ npositive = 0; /* suppress a warning */ ++ parent = dentry->d_parent; /* dir inode is locked */ ++ di_read_lock_parent(parent, AuLock_IR); ++ err = au_alive_dir(parent); ++ if (!err) ++ err = au_digen_test(parent, au_sigen(sb)); ++ if (!err) { ++ npositive = au_lkup_dentry(dentry, au_dbtop(parent), ++ /*type*/0); ++ err = npositive; ++ } ++ di_read_unlock(parent, AuLock_IR); ++ ret = ERR_PTR(err); ++ if (unlikely(err < 0)) ++ goto out_unlock; ++ ++ if (npositive) { ++ inode = au_new_inode(dentry, /*must_new*/0); ++ if (IS_ERR(inode)) { ++ ret = (void *)inode; ++ inode = NULL; ++ goto out_unlock; ++ } ++ } ++ ++ if (inode) ++ atomic_inc(&inode->i_count); ++ ret = d_splice_alias(inode, dentry); ++#if 0 ++ if (unlikely(d_need_lookup(dentry))) { ++ spin_lock(&dentry->d_lock); ++ dentry->d_flags &= ~DCACHE_NEED_LOOKUP; ++ spin_unlock(&dentry->d_lock); ++ } else ++#endif ++ if (inode) { ++ if (!IS_ERR(ret)) { ++ iput(inode); ++ if (ret && ret != dentry) ++ ii_write_unlock(inode); ++ } else { ++ ii_write_unlock(inode); ++ iput(inode); ++ inode = NULL; ++ } ++ } ++ ++out_unlock: ++ di_write_unlock(dentry); ++out_si: ++ si_read_unlock(sb); ++out: ++ return ret; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct aopen_node { ++ struct hlist_node hlist; ++ struct file *file, *h_file; ++}; ++ ++static int au_do_aopen(struct inode *inode, struct file *file) ++{ ++ struct au_sphlhead *aopen; ++ struct aopen_node *node; ++ struct au_do_open_args args = { ++ .no_lock = 1, ++ .open = au_do_open_nondir ++ }; ++ ++ aopen = &au_sbi(inode->i_sb)->si_aopen; ++ spin_lock(&aopen->spin); ++ hlist_for_each_entry(node, &aopen->head, hlist) ++ if (node->file == file) { ++ args.h_file = node->h_file; ++ break; ++ } ++ spin_unlock(&aopen->spin); ++ /* AuDebugOn(!args.h_file); */ ++ ++ return au_do_open(file, &args); ++} ++ ++static int aufs_atomic_open(struct inode *dir, struct dentry *dentry, ++ struct file *file, unsigned int open_flag, ++ umode_t create_mode, int *opened) ++{ ++ int err, h_opened = *opened; ++ struct dentry *parent; ++ struct dentry *d; ++ struct au_sphlhead *aopen; ++ struct vfsub_aopen_args args = { ++ .open_flag = open_flag, ++ .create_mode = create_mode, ++ .opened = &h_opened ++ }; ++ struct aopen_node aopen_node = { ++ .file = file ++ }; ++ ++ IMustLock(dir); ++ AuDbg("open_flag 0x%x\n", open_flag); ++ AuDbgDentry(dentry); ++ ++ err = 0; ++ if (!au_di(dentry)) { ++ d = aufs_lookup(dir, dentry, /*flags*/0); ++ if (IS_ERR(d)) { ++ err = PTR_ERR(d); ++ goto out; ++ } else if (d) { ++ /* ++ * obsoleted dentry found. ++ * another error will be returned later. ++ */ ++ d_drop(d); ++ dput(d); ++ AuDbgDentry(d); ++ } ++ AuDbgDentry(dentry); ++ } ++ ++ if (d_is_positive(dentry) ++ || d_unhashed(dentry) ++ || d_unlinked(dentry) ++ || !(open_flag & O_CREAT)) ++ goto out_no_open; ++ ++ err = aufs_read_lock(dentry, AuLock_DW | AuLock_FLUSH | AuLock_GEN); ++ if (unlikely(err)) ++ goto out; ++ ++ parent = dentry->d_parent; /* dir is locked */ ++ di_write_lock_parent(parent); ++ err = au_lkup_dentry(dentry, /*btop*/0, /*type*/0); ++ if (unlikely(err)) ++ goto out_unlock; ++ ++ AuDbgDentry(dentry); ++ if (d_is_positive(dentry)) ++ goto out_unlock; ++ ++ args.file = get_empty_filp(); ++ err = PTR_ERR(args.file); ++ if (IS_ERR(args.file)) ++ goto out_unlock; ++ ++ args.file->f_flags = file->f_flags; ++ err = au_aopen_or_create(dir, dentry, &args); ++ AuTraceErr(err); ++ AuDbgFile(args.file); ++ if (unlikely(err < 0)) { ++ if (h_opened & FILE_OPENED) ++ fput(args.file); ++ else ++ put_filp(args.file); ++ goto out_unlock; ++ } ++ ++ /* some filesystems don't set FILE_CREATED while succeeded? */ ++ *opened |= FILE_CREATED; ++ if (h_opened & FILE_OPENED) ++ aopen_node.h_file = args.file; ++ else { ++ put_filp(args.file); ++ args.file = NULL; ++ } ++ aopen = &au_sbi(dir->i_sb)->si_aopen; ++ au_sphl_add(&aopen_node.hlist, aopen); ++ err = finish_open(file, dentry, au_do_aopen, opened); ++ au_sphl_del(&aopen_node.hlist, aopen); ++ AuTraceErr(err); ++ AuDbgFile(file); ++ if (aopen_node.h_file) ++ fput(aopen_node.h_file); ++ ++out_unlock: ++ di_write_unlock(parent); ++ aufs_read_unlock(dentry, AuLock_DW); ++ AuDbgDentry(dentry); ++ if (unlikely(err)) ++ goto out; ++out_no_open: ++ if (!err && !(*opened & FILE_CREATED)) { ++ AuLabel(out_no_open); ++ dget(dentry); ++ err = finish_no_open(file, dentry); ++ } ++out: ++ AuDbg("%pd%s%s\n", dentry, ++ (*opened & FILE_CREATED) ? " created" : "", ++ (*opened & FILE_OPENED) ? " opened" : ""); ++ AuTraceErr(err); ++ return err; ++} ++ ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int au_wr_dir_cpup(struct dentry *dentry, struct dentry *parent, ++ const unsigned char add_entry, aufs_bindex_t bcpup, ++ aufs_bindex_t btop) ++{ ++ int err; ++ struct dentry *h_parent; ++ struct inode *h_dir; ++ ++ if (add_entry) ++ IMustLock(d_inode(parent)); ++ else ++ di_write_lock_parent(parent); ++ ++ err = 0; ++ if (!au_h_dptr(parent, bcpup)) { ++ if (btop > bcpup) ++ err = au_cpup_dirs(dentry, bcpup); ++ else if (btop < bcpup) ++ err = au_cpdown_dirs(dentry, bcpup); ++ else ++ BUG(); ++ } ++ if (!err && add_entry && !au_ftest_wrdir(add_entry, TMPFILE)) { ++ h_parent = au_h_dptr(parent, bcpup); ++ h_dir = d_inode(h_parent); ++ inode_lock_nested(h_dir, AuLsc_I_PARENT); ++ err = au_lkup_neg(dentry, bcpup, /*wh*/0); ++ /* todo: no unlock here */ ++ inode_unlock(h_dir); ++ ++ AuDbg("bcpup %d\n", bcpup); ++ if (!err) { ++ if (d_really_is_negative(dentry)) ++ au_set_h_dptr(dentry, btop, NULL); ++ au_update_dbrange(dentry, /*do_put_zero*/0); ++ } ++ } ++ ++ if (!add_entry) ++ di_write_unlock(parent); ++ if (!err) ++ err = bcpup; /* success */ ++ ++ AuTraceErr(err); ++ return err; ++} ++ ++/* ++ * decide the branch and the parent dir where we will create a new entry. ++ * returns new bindex or an error. ++ * copyup the parent dir if needed. ++ */ ++int au_wr_dir(struct dentry *dentry, struct dentry *src_dentry, ++ struct au_wr_dir_args *args) ++{ ++ int err; ++ unsigned int flags; ++ aufs_bindex_t bcpup, btop, src_btop; ++ const unsigned char add_entry ++ = au_ftest_wrdir(args->flags, ADD_ENTRY) ++ | au_ftest_wrdir(args->flags, TMPFILE); ++ struct super_block *sb; ++ struct dentry *parent; ++ struct au_sbinfo *sbinfo; ++ ++ sb = dentry->d_sb; ++ sbinfo = au_sbi(sb); ++ parent = dget_parent(dentry); ++ btop = au_dbtop(dentry); ++ bcpup = btop; ++ if (args->force_btgt < 0) { ++ if (src_dentry) { ++ src_btop = au_dbtop(src_dentry); ++ if (src_btop < btop) ++ bcpup = src_btop; ++ } else if (add_entry) { ++ flags = 0; ++ if (au_ftest_wrdir(args->flags, ISDIR)) ++ au_fset_wbr(flags, DIR); ++ err = AuWbrCreate(sbinfo, dentry, flags); ++ bcpup = err; ++ } ++ ++ if (bcpup < 0 || au_test_ro(sb, bcpup, d_inode(dentry))) { ++ if (add_entry) ++ err = AuWbrCopyup(sbinfo, dentry); ++ else { ++ if (!IS_ROOT(dentry)) { ++ di_read_lock_parent(parent, !AuLock_IR); ++ err = AuWbrCopyup(sbinfo, dentry); ++ di_read_unlock(parent, !AuLock_IR); ++ } else ++ err = AuWbrCopyup(sbinfo, dentry); ++ } ++ bcpup = err; ++ if (unlikely(err < 0)) ++ goto out; ++ } ++ } else { ++ bcpup = args->force_btgt; ++ AuDebugOn(au_test_ro(sb, bcpup, d_inode(dentry))); ++ } ++ ++ AuDbg("btop %d, bcpup %d\n", btop, bcpup); ++ err = bcpup; ++ if (bcpup == btop) ++ goto out; /* success */ ++ ++ /* copyup the new parent into the branch we process */ ++ err = au_wr_dir_cpup(dentry, parent, add_entry, bcpup, btop); ++ if (err >= 0) { ++ if (d_really_is_negative(dentry)) { ++ au_set_h_dptr(dentry, btop, NULL); ++ au_set_dbtop(dentry, bcpup); ++ au_set_dbbot(dentry, bcpup); ++ } ++ AuDebugOn(add_entry ++ && !au_ftest_wrdir(args->flags, TMPFILE) ++ && !au_h_dptr(dentry, bcpup)); ++ } ++ ++out: ++ dput(parent); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++void au_pin_hdir_unlock(struct au_pin *p) ++{ ++ if (p->hdir) ++ au_hn_imtx_unlock(p->hdir); ++} ++ ++int au_pin_hdir_lock(struct au_pin *p) ++{ ++ int err; ++ ++ err = 0; ++ if (!p->hdir) ++ goto out; ++ ++ /* even if an error happens later, keep this lock */ ++ au_hn_imtx_lock_nested(p->hdir, p->lsc_hi); ++ ++ err = -EBUSY; ++ if (unlikely(p->hdir->hi_inode != d_inode(p->h_parent))) ++ goto out; ++ ++ err = 0; ++ if (p->h_dentry) ++ err = au_h_verify(p->h_dentry, p->udba, p->hdir->hi_inode, ++ p->h_parent, p->br); ++ ++out: ++ return err; ++} ++ ++int au_pin_hdir_relock(struct au_pin *p) ++{ ++ int err, i; ++ struct inode *h_i; ++ struct dentry *h_d[] = { ++ p->h_dentry, ++ p->h_parent ++ }; ++ ++ err = au_pin_hdir_lock(p); ++ if (unlikely(err)) ++ goto out; ++ ++ for (i = 0; !err && i < sizeof(h_d)/sizeof(*h_d); i++) { ++ if (!h_d[i]) ++ continue; ++ if (d_is_positive(h_d[i])) { ++ h_i = d_inode(h_d[i]); ++ err = !h_i->i_nlink; ++ } ++ } ++ ++out: ++ return err; ++} ++ ++void au_pin_hdir_set_owner(struct au_pin *p, struct task_struct *task) ++{ ++#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_SMP) ++ p->hdir->hi_inode->i_mutex.owner = task; ++#endif ++} ++ ++void au_pin_hdir_acquire_nest(struct au_pin *p) ++{ ++ if (p->hdir) { ++ mutex_acquire_nest(&p->hdir->hi_inode->i_mutex.dep_map, ++ p->lsc_hi, 0, NULL, _RET_IP_); ++ au_pin_hdir_set_owner(p, current); ++ } ++} ++ ++void au_pin_hdir_release(struct au_pin *p) ++{ ++ if (p->hdir) { ++ au_pin_hdir_set_owner(p, p->task); ++ mutex_release(&p->hdir->hi_inode->i_mutex.dep_map, 1, _RET_IP_); ++ } ++} ++ ++struct dentry *au_pinned_h_parent(struct au_pin *pin) ++{ ++ if (pin && pin->parent) ++ return au_h_dptr(pin->parent, pin->bindex); ++ return NULL; ++} ++ ++void au_unpin(struct au_pin *p) ++{ ++ if (p->hdir) ++ au_pin_hdir_unlock(p); ++ if (p->h_mnt && au_ftest_pin(p->flags, MNT_WRITE)) ++ vfsub_mnt_drop_write(p->h_mnt); ++ if (!p->hdir) ++ return; ++ ++ if (!au_ftest_pin(p->flags, DI_LOCKED)) ++ di_read_unlock(p->parent, AuLock_IR); ++ iput(p->hdir->hi_inode); ++ dput(p->parent); ++ p->parent = NULL; ++ p->hdir = NULL; ++ p->h_mnt = NULL; ++ /* do not clear p->task */ ++} ++ ++int au_do_pin(struct au_pin *p) ++{ ++ int err; ++ struct super_block *sb; ++ struct inode *h_dir; ++ ++ err = 0; ++ sb = p->dentry->d_sb; ++ p->br = au_sbr(sb, p->bindex); ++ if (IS_ROOT(p->dentry)) { ++ if (au_ftest_pin(p->flags, MNT_WRITE)) { ++ p->h_mnt = au_br_mnt(p->br); ++ err = vfsub_mnt_want_write(p->h_mnt); ++ if (unlikely(err)) { ++ au_fclr_pin(p->flags, MNT_WRITE); ++ goto out_err; ++ } ++ } ++ goto out; ++ } ++ ++ p->h_dentry = NULL; ++ if (p->bindex <= au_dbbot(p->dentry)) ++ p->h_dentry = au_h_dptr(p->dentry, p->bindex); ++ ++ p->parent = dget_parent(p->dentry); ++ if (!au_ftest_pin(p->flags, DI_LOCKED)) ++ di_read_lock(p->parent, AuLock_IR, p->lsc_di); ++ ++ h_dir = NULL; ++ p->h_parent = au_h_dptr(p->parent, p->bindex); ++ p->hdir = au_hi(d_inode(p->parent), p->bindex); ++ if (p->hdir) ++ h_dir = p->hdir->hi_inode; ++ ++ /* ++ * udba case, or ++ * if DI_LOCKED is not set, then p->parent may be different ++ * and h_parent can be NULL. ++ */ ++ if (unlikely(!p->hdir || !h_dir || !p->h_parent)) { ++ err = -EBUSY; ++ if (!au_ftest_pin(p->flags, DI_LOCKED)) ++ di_read_unlock(p->parent, AuLock_IR); ++ dput(p->parent); ++ p->parent = NULL; ++ goto out_err; ++ } ++ ++ if (au_ftest_pin(p->flags, MNT_WRITE)) { ++ p->h_mnt = au_br_mnt(p->br); ++ err = vfsub_mnt_want_write(p->h_mnt); ++ if (unlikely(err)) { ++ au_fclr_pin(p->flags, MNT_WRITE); ++ if (!au_ftest_pin(p->flags, DI_LOCKED)) ++ di_read_unlock(p->parent, AuLock_IR); ++ dput(p->parent); ++ p->parent = NULL; ++ goto out_err; ++ } ++ } ++ ++ au_igrab(h_dir); ++ err = au_pin_hdir_lock(p); ++ if (!err) ++ goto out; /* success */ ++ ++ au_unpin(p); ++ ++out_err: ++ pr_err("err %d\n", err); ++ err = au_busy_or_stale(); ++out: ++ return err; ++} ++ ++void au_pin_init(struct au_pin *p, struct dentry *dentry, ++ aufs_bindex_t bindex, int lsc_di, int lsc_hi, ++ unsigned int udba, unsigned char flags) ++{ ++ p->dentry = dentry; ++ p->udba = udba; ++ p->lsc_di = lsc_di; ++ p->lsc_hi = lsc_hi; ++ p->flags = flags; ++ p->bindex = bindex; ++ ++ p->parent = NULL; ++ p->hdir = NULL; ++ p->h_mnt = NULL; ++ ++ p->h_dentry = NULL; ++ p->h_parent = NULL; ++ p->br = NULL; ++ p->task = current; ++} ++ ++int au_pin(struct au_pin *pin, struct dentry *dentry, aufs_bindex_t bindex, ++ unsigned int udba, unsigned char flags) ++{ ++ au_pin_init(pin, dentry, bindex, AuLsc_DI_PARENT, AuLsc_I_PARENT2, ++ udba, flags); ++ return au_do_pin(pin); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * ->setattr() and ->getattr() are called in various cases. ++ * chmod, stat: dentry is revalidated. ++ * fchmod, fstat: file and dentry are not revalidated, additionally they may be ++ * unhashed. ++ * for ->setattr(), ia->ia_file is passed from ftruncate only. ++ */ ++/* todo: consolidate with do_refresh() and simple_reval_dpath() */ ++int au_reval_for_attr(struct dentry *dentry, unsigned int sigen) ++{ ++ int err; ++ struct dentry *parent; ++ ++ err = 0; ++ if (au_digen_test(dentry, sigen)) { ++ parent = dget_parent(dentry); ++ di_read_lock_parent(parent, AuLock_IR); ++ err = au_refresh_dentry(dentry, parent); ++ di_read_unlock(parent, AuLock_IR); ++ dput(parent); ++ } ++ ++ AuTraceErr(err); ++ return err; ++} ++ ++int au_pin_and_icpup(struct dentry *dentry, struct iattr *ia, ++ struct au_icpup_args *a) ++{ ++ int err; ++ loff_t sz; ++ aufs_bindex_t btop, ibtop; ++ struct dentry *hi_wh, *parent; ++ struct inode *inode; ++ struct au_wr_dir_args wr_dir_args = { ++ .force_btgt = -1, ++ .flags = 0 ++ }; ++ ++ if (d_is_dir(dentry)) ++ au_fset_wrdir(wr_dir_args.flags, ISDIR); ++ /* plink or hi_wh() case */ ++ btop = au_dbtop(dentry); ++ inode = d_inode(dentry); ++ ibtop = au_ibtop(inode); ++ if (btop != ibtop && !au_test_ro(inode->i_sb, ibtop, inode)) ++ wr_dir_args.force_btgt = ibtop; ++ err = au_wr_dir(dentry, /*src_dentry*/NULL, &wr_dir_args); ++ if (unlikely(err < 0)) ++ goto out; ++ a->btgt = err; ++ if (err != btop) ++ au_fset_icpup(a->flags, DID_CPUP); ++ ++ err = 0; ++ a->pin_flags = AuPin_MNT_WRITE; ++ parent = NULL; ++ if (!IS_ROOT(dentry)) { ++ au_fset_pin(a->pin_flags, DI_LOCKED); ++ parent = dget_parent(dentry); ++ di_write_lock_parent(parent); ++ } ++ ++ err = au_pin(&a->pin, dentry, a->btgt, a->udba, a->pin_flags); ++ if (unlikely(err)) ++ goto out_parent; ++ ++ sz = -1; ++ a->h_path.dentry = au_h_dptr(dentry, btop); ++ a->h_inode = d_inode(a->h_path.dentry); ++ if (ia && (ia->ia_valid & ATTR_SIZE)) { ++ inode_lock_nested(a->h_inode, AuLsc_I_CHILD); ++ if (ia->ia_size < i_size_read(a->h_inode)) ++ sz = ia->ia_size; ++ inode_unlock(a->h_inode); ++ } ++ ++ hi_wh = NULL; ++ if (au_ftest_icpup(a->flags, DID_CPUP) && d_unlinked(dentry)) { ++ hi_wh = au_hi_wh(inode, a->btgt); ++ if (!hi_wh) { ++ struct au_cp_generic cpg = { ++ .dentry = dentry, ++ .bdst = a->btgt, ++ .bsrc = -1, ++ .len = sz, ++ .pin = &a->pin ++ }; ++ err = au_sio_cpup_wh(&cpg, /*file*/NULL); ++ if (unlikely(err)) ++ goto out_unlock; ++ hi_wh = au_hi_wh(inode, a->btgt); ++ /* todo: revalidate hi_wh? */ ++ } ++ } ++ ++ if (parent) { ++ au_pin_set_parent_lflag(&a->pin, /*lflag*/0); ++ di_downgrade_lock(parent, AuLock_IR); ++ dput(parent); ++ parent = NULL; ++ } ++ if (!au_ftest_icpup(a->flags, DID_CPUP)) ++ goto out; /* success */ ++ ++ if (!d_unhashed(dentry)) { ++ struct au_cp_generic cpg = { ++ .dentry = dentry, ++ .bdst = a->btgt, ++ .bsrc = btop, ++ .len = sz, ++ .pin = &a->pin, ++ .flags = AuCpup_DTIME | AuCpup_HOPEN ++ }; ++ err = au_sio_cpup_simple(&cpg); ++ if (!err) ++ a->h_path.dentry = au_h_dptr(dentry, a->btgt); ++ } else if (!hi_wh) ++ a->h_path.dentry = au_h_dptr(dentry, a->btgt); ++ else ++ a->h_path.dentry = hi_wh; /* do not dget here */ ++ ++out_unlock: ++ a->h_inode = d_inode(a->h_path.dentry); ++ if (!err) ++ goto out; /* success */ ++ au_unpin(&a->pin); ++out_parent: ++ if (parent) { ++ di_write_unlock(parent); ++ dput(parent); ++ } ++out: ++ if (!err) ++ inode_lock_nested(a->h_inode, AuLsc_I_CHILD); ++ return err; ++} ++ ++static int aufs_setattr(struct dentry *dentry, struct iattr *ia) ++{ ++ int err; ++ struct inode *inode, *delegated; ++ struct super_block *sb; ++ struct file *file; ++ struct au_icpup_args *a; ++ ++ inode = d_inode(dentry); ++ IMustLock(inode); ++ ++ err = -ENOMEM; ++ a = kzalloc(sizeof(*a), GFP_NOFS); ++ if (unlikely(!a)) ++ goto out; ++ ++ if (ia->ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID)) ++ ia->ia_valid &= ~ATTR_MODE; ++ ++ file = NULL; ++ sb = dentry->d_sb; ++ err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM); ++ if (unlikely(err)) ++ goto out_kfree; ++ ++ if (ia->ia_valid & ATTR_FILE) { ++ /* currently ftruncate(2) only */ ++ AuDebugOn(!d_is_reg(dentry)); ++ file = ia->ia_file; ++ err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/1); ++ if (unlikely(err)) ++ goto out_si; ++ ia->ia_file = au_hf_top(file); ++ a->udba = AuOpt_UDBA_NONE; ++ } else { ++ /* fchmod() doesn't pass ia_file */ ++ a->udba = au_opt_udba(sb); ++ di_write_lock_child(dentry); ++ /* no d_unlinked(), to set UDBA_NONE for root */ ++ if (d_unhashed(dentry)) ++ a->udba = AuOpt_UDBA_NONE; ++ if (a->udba != AuOpt_UDBA_NONE) { ++ AuDebugOn(IS_ROOT(dentry)); ++ err = au_reval_for_attr(dentry, au_sigen(sb)); ++ if (unlikely(err)) ++ goto out_dentry; ++ } ++ } ++ ++ err = au_pin_and_icpup(dentry, ia, a); ++ if (unlikely(err < 0)) ++ goto out_dentry; ++ if (au_ftest_icpup(a->flags, DID_CPUP)) { ++ ia->ia_file = NULL; ++ ia->ia_valid &= ~ATTR_FILE; ++ } ++ ++ a->h_path.mnt = au_sbr_mnt(sb, a->btgt); ++ if ((ia->ia_valid & (ATTR_MODE | ATTR_CTIME)) ++ == (ATTR_MODE | ATTR_CTIME)) { ++ err = security_path_chmod(&a->h_path, ia->ia_mode); ++ if (unlikely(err)) ++ goto out_unlock; ++ } else if ((ia->ia_valid & (ATTR_UID | ATTR_GID)) ++ && (ia->ia_valid & ATTR_CTIME)) { ++ err = security_path_chown(&a->h_path, ia->ia_uid, ia->ia_gid); ++ if (unlikely(err)) ++ goto out_unlock; ++ } ++ ++ if (ia->ia_valid & ATTR_SIZE) { ++ struct file *f; ++ ++ if (ia->ia_size < i_size_read(inode)) ++ /* unmap only */ ++ truncate_setsize(inode, ia->ia_size); ++ ++ f = NULL; ++ if (ia->ia_valid & ATTR_FILE) ++ f = ia->ia_file; ++ inode_unlock(a->h_inode); ++ err = vfsub_trunc(&a->h_path, ia->ia_size, ia->ia_valid, f); ++ inode_lock_nested(a->h_inode, AuLsc_I_CHILD); ++ } else { ++ delegated = NULL; ++ while (1) { ++ err = vfsub_notify_change(&a->h_path, ia, &delegated); ++ if (delegated) { ++ err = break_deleg_wait(&delegated); ++ if (!err) ++ continue; ++ } ++ break; ++ } ++ } ++ /* ++ * regardless aufs 'acl' option setting. ++ * why don't all acl-aware fs call this func from their ->setattr()? ++ */ ++ if (!err && (ia->ia_valid & ATTR_MODE)) ++ err = vfsub_acl_chmod(a->h_inode, ia->ia_mode); ++ if (!err) ++ au_cpup_attr_changeable(inode); ++ ++out_unlock: ++ inode_unlock(a->h_inode); ++ au_unpin(&a->pin); ++ if (unlikely(err)) ++ au_update_dbtop(dentry); ++out_dentry: ++ di_write_unlock(dentry); ++ if (file) { ++ fi_write_unlock(file); ++ ia->ia_file = file; ++ ia->ia_valid |= ATTR_FILE; ++ } ++out_si: ++ si_read_unlock(sb); ++out_kfree: ++ kfree(a); ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++#if IS_ENABLED(CONFIG_AUFS_XATTR) || IS_ENABLED(CONFIG_FS_POSIX_ACL) ++static int au_h_path_to_set_attr(struct dentry *dentry, ++ struct au_icpup_args *a, struct path *h_path) ++{ ++ int err; ++ struct super_block *sb; ++ ++ sb = dentry->d_sb; ++ a->udba = au_opt_udba(sb); ++ /* no d_unlinked(), to set UDBA_NONE for root */ ++ if (d_unhashed(dentry)) ++ a->udba = AuOpt_UDBA_NONE; ++ if (a->udba != AuOpt_UDBA_NONE) { ++ AuDebugOn(IS_ROOT(dentry)); ++ err = au_reval_for_attr(dentry, au_sigen(sb)); ++ if (unlikely(err)) ++ goto out; ++ } ++ err = au_pin_and_icpup(dentry, /*ia*/NULL, a); ++ if (unlikely(err < 0)) ++ goto out; ++ ++ h_path->dentry = a->h_path.dentry; ++ h_path->mnt = au_sbr_mnt(sb, a->btgt); ++ ++out: ++ return err; ++} ++ ++ssize_t au_srxattr(struct dentry *dentry, struct au_srxattr *arg) ++{ ++ int err; ++ struct path h_path; ++ struct super_block *sb; ++ struct au_icpup_args *a; ++ struct inode *inode, *h_inode; ++ ++ inode = d_inode(dentry); ++ IMustLock(inode); ++ ++ err = -ENOMEM; ++ a = kzalloc(sizeof(*a), GFP_NOFS); ++ if (unlikely(!a)) ++ goto out; ++ ++ sb = dentry->d_sb; ++ err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM); ++ if (unlikely(err)) ++ goto out_kfree; ++ ++ h_path.dentry = NULL; /* silence gcc */ ++ di_write_lock_child(dentry); ++ err = au_h_path_to_set_attr(dentry, a, &h_path); ++ if (unlikely(err)) ++ goto out_di; ++ ++ inode_unlock(a->h_inode); ++ switch (arg->type) { ++ case AU_XATTR_SET: ++ err = vfsub_setxattr(h_path.dentry, ++ arg->u.set.name, arg->u.set.value, ++ arg->u.set.size, arg->u.set.flags); ++ break; ++ case AU_XATTR_REMOVE: ++ err = vfsub_removexattr(h_path.dentry, arg->u.remove.name); ++ break; ++ case AU_ACL_SET: ++ err = -EOPNOTSUPP; ++ h_inode = d_inode(h_path.dentry); ++ if (h_inode->i_op->set_acl) ++ err = h_inode->i_op->set_acl(h_inode, ++ arg->u.acl_set.acl, ++ arg->u.acl_set.type); ++ break; ++ } ++ if (!err) ++ au_cpup_attr_timesizes(inode); ++ ++ au_unpin(&a->pin); ++ if (unlikely(err)) ++ au_update_dbtop(dentry); ++ ++out_di: ++ di_write_unlock(dentry); ++ si_read_unlock(sb); ++out_kfree: ++ kfree(a); ++out: ++ AuTraceErr(err); ++ return err; ++} ++#endif ++ ++static void au_refresh_iattr(struct inode *inode, struct kstat *st, ++ unsigned int nlink) ++{ ++ unsigned int n; ++ ++ inode->i_mode = st->mode; ++ /* don't i_[ug]id_write() here */ ++ inode->i_uid = st->uid; ++ inode->i_gid = st->gid; ++ inode->i_atime = st->atime; ++ inode->i_mtime = st->mtime; ++ inode->i_ctime = st->ctime; ++ ++ au_cpup_attr_nlink(inode, /*force*/0); ++ if (S_ISDIR(inode->i_mode)) { ++ n = inode->i_nlink; ++ n -= nlink; ++ n += st->nlink; ++ smp_mb(); /* for i_nlink */ ++ /* 0 can happen */ ++ set_nlink(inode, n); ++ } ++ ++ spin_lock(&inode->i_lock); ++ inode->i_blocks = st->blocks; ++ i_size_write(inode, st->size); ++ spin_unlock(&inode->i_lock); ++} ++ ++/* ++ * common routine for aufs_getattr() and aufs_getxattr(). ++ * returns zero or negative (an error). ++ * @dentry will be read-locked in success. ++ */ ++int au_h_path_getattr(struct dentry *dentry, int force, struct path *h_path) ++{ ++ int err; ++ unsigned int mnt_flags, sigen; ++ unsigned char udba_none; ++ aufs_bindex_t bindex; ++ struct super_block *sb, *h_sb; ++ struct inode *inode; ++ ++ h_path->mnt = NULL; ++ h_path->dentry = NULL; ++ ++ err = 0; ++ sb = dentry->d_sb; ++ mnt_flags = au_mntflags(sb); ++ udba_none = !!au_opt_test(mnt_flags, UDBA_NONE); ++ ++ /* support fstat(2) */ ++ if (!d_unlinked(dentry) && !udba_none) { ++ sigen = au_sigen(sb); ++ err = au_digen_test(dentry, sigen); ++ if (!err) { ++ di_read_lock_child(dentry, AuLock_IR); ++ err = au_dbrange_test(dentry); ++ if (unlikely(err)) { ++ di_read_unlock(dentry, AuLock_IR); ++ goto out; ++ } ++ } else { ++ AuDebugOn(IS_ROOT(dentry)); ++ di_write_lock_child(dentry); ++ err = au_dbrange_test(dentry); ++ if (!err) ++ err = au_reval_for_attr(dentry, sigen); ++ if (!err) ++ di_downgrade_lock(dentry, AuLock_IR); ++ else { ++ di_write_unlock(dentry); ++ goto out; ++ } ++ } ++ } else ++ di_read_lock_child(dentry, AuLock_IR); ++ ++ inode = d_inode(dentry); ++ bindex = au_ibtop(inode); ++ h_path->mnt = au_sbr_mnt(sb, bindex); ++ h_sb = h_path->mnt->mnt_sb; ++ if (!force ++ && !au_test_fs_bad_iattr(h_sb) ++ && udba_none) ++ goto out; /* success */ ++ ++ if (au_dbtop(dentry) == bindex) ++ h_path->dentry = au_h_dptr(dentry, bindex); ++ else if (au_opt_test(mnt_flags, PLINK) && au_plink_test(inode)) { ++ h_path->dentry = au_plink_lkup(inode, bindex); ++ if (IS_ERR(h_path->dentry)) ++ /* pretending success */ ++ h_path->dentry = NULL; ++ else ++ dput(h_path->dentry); ++ } ++ ++out: ++ return err; ++} ++ ++static int aufs_getattr(struct vfsmount *mnt __maybe_unused, ++ struct dentry *dentry, struct kstat *st) ++{ ++ int err; ++ unsigned char positive; ++ struct path h_path; ++ struct inode *inode; ++ struct super_block *sb; ++ ++ inode = d_inode(dentry); ++ sb = dentry->d_sb; ++ err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM); ++ if (unlikely(err)) ++ goto out; ++ err = au_h_path_getattr(dentry, /*force*/0, &h_path); ++ if (unlikely(err)) ++ goto out_si; ++ if (unlikely(!h_path.dentry)) ++ /* illegally overlapped or something */ ++ goto out_fill; /* pretending success */ ++ ++ positive = d_is_positive(h_path.dentry); ++ if (positive) ++ err = vfs_getattr(&h_path, st); ++ if (!err) { ++ if (positive) ++ au_refresh_iattr(inode, st, ++ d_inode(h_path.dentry)->i_nlink); ++ goto out_fill; /* success */ ++ } ++ AuTraceErr(err); ++ goto out_di; ++ ++out_fill: ++ generic_fillattr(inode, st); ++out_di: ++ di_read_unlock(dentry, AuLock_IR); ++out_si: ++ si_read_unlock(sb); ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static const char *aufs_get_link(struct dentry *dentry, struct inode *inode, ++ struct delayed_call *done) ++{ ++ const char *ret; ++ struct dentry *h_dentry; ++ struct inode *h_inode; ++ int err; ++ aufs_bindex_t bindex; ++ ++ ret = NULL; /* suppress a warning */ ++ err = -ECHILD; ++ if (!dentry) ++ goto out; ++ ++ err = aufs_read_lock(dentry, AuLock_IR | AuLock_GEN); ++ if (unlikely(err)) ++ goto out; ++ ++ err = au_d_hashed_positive(dentry); ++ if (unlikely(err)) ++ goto out_unlock; ++ ++ err = -EINVAL; ++ inode = d_inode(dentry); ++ bindex = au_ibtop(inode); ++ h_inode = au_h_iptr(inode, bindex); ++ if (unlikely(!h_inode->i_op->get_link)) ++ goto out_unlock; ++ ++ err = -EBUSY; ++ h_dentry = NULL; ++ if (au_dbtop(dentry) <= bindex) { ++ h_dentry = au_h_dptr(dentry, bindex); ++ if (h_dentry) ++ dget(h_dentry); ++ } ++ if (!h_dentry) { ++ h_dentry = d_find_any_alias(h_inode); ++ if (IS_ERR(h_dentry)) { ++ err = PTR_ERR(h_dentry); ++ goto out_unlock; ++ } ++ } ++ if (unlikely(!h_dentry)) ++ goto out_unlock; ++ ++ err = 0; ++ AuDbg("%pf\n", h_inode->i_op->get_link); ++ AuDbgDentry(h_dentry); ++ ret = h_inode->i_op->get_link(h_dentry, h_inode, done); ++ dput(h_dentry); ++ if (IS_ERR(ret)) ++ err = PTR_ERR(ret); ++ ++out_unlock: ++ aufs_read_unlock(dentry, AuLock_IR); ++out: ++ if (unlikely(err)) ++ ret = ERR_PTR(err); ++ AuTraceErrPtr(ret); ++ return ret; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int aufs_update_time(struct inode *inode, struct timespec *ts, int flags) ++{ ++ int err; ++ struct super_block *sb; ++ struct inode *h_inode; ++ ++ sb = inode->i_sb; ++ /* mmap_sem might be acquired already, cf. aufs_mmap() */ ++ lockdep_off(); ++ si_read_lock(sb, AuLock_FLUSH); ++ ii_write_lock_child(inode); ++ lockdep_on(); ++ h_inode = au_h_iptr(inode, au_ibtop(inode)); ++ err = vfsub_update_time(h_inode, ts, flags); ++ lockdep_off(); ++ if (!err) ++ au_cpup_attr_timesizes(inode); ++ ii_write_unlock(inode); ++ si_read_unlock(sb); ++ lockdep_on(); ++ ++ if (!err && (flags & S_VERSION)) ++ inode_inc_iversion(inode); ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* no getattr version will be set by module.c:aufs_init() */ ++struct inode_operations aufs_iop_nogetattr[AuIop_Last], ++ aufs_iop[] = { ++ [AuIop_SYMLINK] = { ++ .permission = aufs_permission, ++#ifdef CONFIG_FS_POSIX_ACL ++ .get_acl = aufs_get_acl, ++ .set_acl = aufs_set_acl, /* unsupport for symlink? */ ++#endif ++ ++ .setattr = aufs_setattr, ++ .getattr = aufs_getattr, ++ ++#ifdef CONFIG_AUFS_XATTR ++ .setxattr = aufs_setxattr, ++ .getxattr = aufs_getxattr, ++ .listxattr = aufs_listxattr, ++ .removexattr = aufs_removexattr, ++#endif ++ ++ .readlink = generic_readlink, ++ .get_link = aufs_get_link, ++ ++ /* .update_time = aufs_update_time */ ++ }, ++ [AuIop_DIR] = { ++ .create = aufs_create, ++ .lookup = aufs_lookup, ++ .link = aufs_link, ++ .unlink = aufs_unlink, ++ .symlink = aufs_symlink, ++ .mkdir = aufs_mkdir, ++ .rmdir = aufs_rmdir, ++ .mknod = aufs_mknod, ++ .rename = aufs_rename, ++ ++ .permission = aufs_permission, ++#ifdef CONFIG_FS_POSIX_ACL ++ .get_acl = aufs_get_acl, ++ .set_acl = aufs_set_acl, ++#endif ++ ++ .setattr = aufs_setattr, ++ .getattr = aufs_getattr, ++ ++#ifdef CONFIG_AUFS_XATTR ++ .setxattr = aufs_setxattr, ++ .getxattr = aufs_getxattr, ++ .listxattr = aufs_listxattr, ++ .removexattr = aufs_removexattr, ++#endif ++ ++ .update_time = aufs_update_time, ++ .atomic_open = aufs_atomic_open, ++ .tmpfile = aufs_tmpfile ++ }, ++ [AuIop_OTHER] = { ++ .permission = aufs_permission, ++#ifdef CONFIG_FS_POSIX_ACL ++ .get_acl = aufs_get_acl, ++ .set_acl = aufs_set_acl, ++#endif ++ ++ .setattr = aufs_setattr, ++ .getattr = aufs_getattr, ++ ++#ifdef CONFIG_AUFS_XATTR ++ .setxattr = aufs_setxattr, ++ .getxattr = aufs_getxattr, ++ .listxattr = aufs_listxattr, ++ .removexattr = aufs_removexattr, ++#endif ++ ++ .update_time = aufs_update_time ++ } ++}; +diff --git a/fs/aufs/i_op_add.c b/fs/aufs/i_op_add.c +new file mode 100644 +index 0000000..e0833c5 +--- /dev/null ++++ b/fs/aufs/i_op_add.c +@@ -0,0 +1,924 @@ ++/* ++ * Copyright (C) 2005-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * inode operations (add entry) ++ */ ++ ++#include "aufs.h" ++ ++/* ++ * final procedure of adding a new entry, except link(2). ++ * remove whiteout, instantiate, copyup the parent dir's times and size ++ * and update version. ++ * if it failed, re-create the removed whiteout. ++ */ ++static int epilog(struct inode *dir, aufs_bindex_t bindex, ++ struct dentry *wh_dentry, struct dentry *dentry) ++{ ++ int err, rerr; ++ aufs_bindex_t bwh; ++ struct path h_path; ++ struct super_block *sb; ++ struct inode *inode, *h_dir; ++ struct dentry *wh; ++ ++ bwh = -1; ++ sb = dir->i_sb; ++ if (wh_dentry) { ++ h_dir = d_inode(wh_dentry->d_parent); /* dir inode is locked */ ++ IMustLock(h_dir); ++ AuDebugOn(au_h_iptr(dir, bindex) != h_dir); ++ bwh = au_dbwh(dentry); ++ h_path.dentry = wh_dentry; ++ h_path.mnt = au_sbr_mnt(sb, bindex); ++ err = au_wh_unlink_dentry(au_h_iptr(dir, bindex), &h_path, ++ dentry); ++ if (unlikely(err)) ++ goto out; ++ } ++ ++ inode = au_new_inode(dentry, /*must_new*/1); ++ if (!IS_ERR(inode)) { ++ d_instantiate(dentry, inode); ++ dir = d_inode(dentry->d_parent); /* dir inode is locked */ ++ IMustLock(dir); ++ au_dir_ts(dir, bindex); ++ dir->i_version++; ++ au_fhsm_wrote(sb, bindex, /*force*/0); ++ return 0; /* success */ ++ } ++ ++ err = PTR_ERR(inode); ++ if (!wh_dentry) ++ goto out; ++ ++ /* revert */ ++ /* dir inode is locked */ ++ wh = au_wh_create(dentry, bwh, wh_dentry->d_parent); ++ rerr = PTR_ERR(wh); ++ if (IS_ERR(wh)) { ++ AuIOErr("%pd reverting whiteout failed(%d, %d)\n", ++ dentry, err, rerr); ++ err = -EIO; ++ } else ++ dput(wh); ++ ++out: ++ return err; ++} ++ ++static int au_d_may_add(struct dentry *dentry) ++{ ++ int err; ++ ++ err = 0; ++ if (unlikely(d_unhashed(dentry))) ++ err = -ENOENT; ++ if (unlikely(d_really_is_positive(dentry))) ++ err = -EEXIST; ++ return err; ++} ++ ++/* ++ * simple tests for the adding inode operations. ++ * following the checks in vfs, plus the parent-child relationship. ++ */ ++int au_may_add(struct dentry *dentry, aufs_bindex_t bindex, ++ struct dentry *h_parent, int isdir) ++{ ++ int err; ++ umode_t h_mode; ++ struct dentry *h_dentry; ++ struct inode *h_inode; ++ ++ err = -ENAMETOOLONG; ++ if (unlikely(dentry->d_name.len > AUFS_MAX_NAMELEN)) ++ goto out; ++ ++ h_dentry = au_h_dptr(dentry, bindex); ++ if (d_really_is_negative(dentry)) { ++ err = -EEXIST; ++ if (unlikely(d_is_positive(h_dentry))) ++ goto out; ++ } else { ++ /* rename(2) case */ ++ err = -EIO; ++ if (unlikely(d_is_negative(h_dentry))) ++ goto out; ++ h_inode = d_inode(h_dentry); ++ if (unlikely(!h_inode->i_nlink)) ++ goto out; ++ ++ h_mode = h_inode->i_mode; ++ if (!isdir) { ++ err = -EISDIR; ++ if (unlikely(S_ISDIR(h_mode))) ++ goto out; ++ } else if (unlikely(!S_ISDIR(h_mode))) { ++ err = -ENOTDIR; ++ goto out; ++ } ++ } ++ ++ err = 0; ++ /* expected parent dir is locked */ ++ if (unlikely(h_parent != h_dentry->d_parent)) ++ err = -EIO; ++ ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++/* ++ * initial procedure of adding a new entry. ++ * prepare writable branch and the parent dir, lock it, ++ * and lookup whiteout for the new entry. ++ */ ++static struct dentry* ++lock_hdir_lkup_wh(struct dentry *dentry, struct au_dtime *dt, ++ struct dentry *src_dentry, struct au_pin *pin, ++ struct au_wr_dir_args *wr_dir_args) ++{ ++ struct dentry *wh_dentry, *h_parent; ++ struct super_block *sb; ++ struct au_branch *br; ++ int err; ++ unsigned int udba; ++ aufs_bindex_t bcpup; ++ ++ AuDbg("%pd\n", dentry); ++ ++ err = au_wr_dir(dentry, src_dentry, wr_dir_args); ++ bcpup = err; ++ wh_dentry = ERR_PTR(err); ++ if (unlikely(err < 0)) ++ goto out; ++ ++ sb = dentry->d_sb; ++ udba = au_opt_udba(sb); ++ err = au_pin(pin, dentry, bcpup, udba, ++ AuPin_DI_LOCKED | AuPin_MNT_WRITE); ++ wh_dentry = ERR_PTR(err); ++ if (unlikely(err)) ++ goto out; ++ ++ h_parent = au_pinned_h_parent(pin); ++ if (udba != AuOpt_UDBA_NONE ++ && au_dbtop(dentry) == bcpup) ++ err = au_may_add(dentry, bcpup, h_parent, ++ au_ftest_wrdir(wr_dir_args->flags, ISDIR)); ++ else if (unlikely(dentry->d_name.len > AUFS_MAX_NAMELEN)) ++ err = -ENAMETOOLONG; ++ wh_dentry = ERR_PTR(err); ++ if (unlikely(err)) ++ goto out_unpin; ++ ++ br = au_sbr(sb, bcpup); ++ if (dt) { ++ struct path tmp = { ++ .dentry = h_parent, ++ .mnt = au_br_mnt(br) ++ }; ++ au_dtime_store(dt, au_pinned_parent(pin), &tmp); ++ } ++ ++ wh_dentry = NULL; ++ if (bcpup != au_dbwh(dentry)) ++ goto out; /* success */ ++ ++ /* ++ * ENAMETOOLONG here means that if we allowed create such name, then it ++ * would not be able to removed in the future. So we don't allow such ++ * name here and we don't handle ENAMETOOLONG differently here. ++ */ ++ wh_dentry = au_wh_lkup(h_parent, &dentry->d_name, br); ++ ++out_unpin: ++ if (IS_ERR(wh_dentry)) ++ au_unpin(pin); ++out: ++ return wh_dentry; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++enum { Mknod, Symlink, Creat }; ++struct simple_arg { ++ int type; ++ union { ++ struct { ++ umode_t mode; ++ bool want_excl; ++ bool try_aopen; ++ struct vfsub_aopen_args *aopen; ++ } c; ++ struct { ++ const char *symname; ++ } s; ++ struct { ++ umode_t mode; ++ dev_t dev; ++ } m; ++ } u; ++}; ++ ++static int add_simple(struct inode *dir, struct dentry *dentry, ++ struct simple_arg *arg) ++{ ++ int err, rerr; ++ aufs_bindex_t btop; ++ unsigned char created; ++ const unsigned char try_aopen ++ = (arg->type == Creat && arg->u.c.try_aopen); ++ struct dentry *wh_dentry, *parent; ++ struct inode *h_dir; ++ struct super_block *sb; ++ struct au_branch *br; ++ /* to reuduce stack size */ ++ struct { ++ struct au_dtime dt; ++ struct au_pin pin; ++ struct path h_path; ++ struct au_wr_dir_args wr_dir_args; ++ } *a; ++ ++ AuDbg("%pd\n", dentry); ++ IMustLock(dir); ++ ++ err = -ENOMEM; ++ a = kmalloc(sizeof(*a), GFP_NOFS); ++ if (unlikely(!a)) ++ goto out; ++ a->wr_dir_args.force_btgt = -1; ++ a->wr_dir_args.flags = AuWrDir_ADD_ENTRY; ++ ++ parent = dentry->d_parent; /* dir inode is locked */ ++ if (!try_aopen) { ++ err = aufs_read_lock(dentry, AuLock_DW | AuLock_GEN); ++ if (unlikely(err)) ++ goto out_free; ++ } ++ err = au_d_may_add(dentry); ++ if (unlikely(err)) ++ goto out_unlock; ++ if (!try_aopen) ++ di_write_lock_parent(parent); ++ wh_dentry = lock_hdir_lkup_wh(dentry, &a->dt, /*src_dentry*/NULL, ++ &a->pin, &a->wr_dir_args); ++ err = PTR_ERR(wh_dentry); ++ if (IS_ERR(wh_dentry)) ++ goto out_parent; ++ ++ btop = au_dbtop(dentry); ++ sb = dentry->d_sb; ++ br = au_sbr(sb, btop); ++ a->h_path.dentry = au_h_dptr(dentry, btop); ++ a->h_path.mnt = au_br_mnt(br); ++ h_dir = au_pinned_h_dir(&a->pin); ++ switch (arg->type) { ++ case Creat: ++ err = 0; ++ if (!try_aopen || !h_dir->i_op->atomic_open) ++ err = vfsub_create(h_dir, &a->h_path, arg->u.c.mode, ++ arg->u.c.want_excl); ++ else ++ err = vfsub_atomic_open(h_dir, a->h_path.dentry, ++ arg->u.c.aopen, br); ++ break; ++ case Symlink: ++ err = vfsub_symlink(h_dir, &a->h_path, arg->u.s.symname); ++ break; ++ case Mknod: ++ err = vfsub_mknod(h_dir, &a->h_path, arg->u.m.mode, ++ arg->u.m.dev); ++ break; ++ default: ++ BUG(); ++ } ++ created = !err; ++ if (!err) ++ err = epilog(dir, btop, wh_dentry, dentry); ++ ++ /* revert */ ++ if (unlikely(created && err && d_is_positive(a->h_path.dentry))) { ++ /* no delegation since it is just created */ ++ rerr = vfsub_unlink(h_dir, &a->h_path, /*delegated*/NULL, ++ /*force*/0); ++ if (rerr) { ++ AuIOErr("%pd revert failure(%d, %d)\n", ++ dentry, err, rerr); ++ err = -EIO; ++ } ++ au_dtime_revert(&a->dt); ++ } ++ ++ if (!err && try_aopen && !h_dir->i_op->atomic_open) ++ *arg->u.c.aopen->opened |= FILE_CREATED; ++ ++ au_unpin(&a->pin); ++ dput(wh_dentry); ++ ++out_parent: ++ if (!try_aopen) ++ di_write_unlock(parent); ++out_unlock: ++ if (unlikely(err)) { ++ au_update_dbtop(dentry); ++ d_drop(dentry); ++ } ++ if (!try_aopen) ++ aufs_read_unlock(dentry, AuLock_DW); ++out_free: ++ kfree(a); ++out: ++ return err; ++} ++ ++int aufs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, ++ dev_t dev) ++{ ++ struct simple_arg arg = { ++ .type = Mknod, ++ .u.m = { ++ .mode = mode, ++ .dev = dev ++ } ++ }; ++ return add_simple(dir, dentry, &arg); ++} ++ ++int aufs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) ++{ ++ struct simple_arg arg = { ++ .type = Symlink, ++ .u.s.symname = symname ++ }; ++ return add_simple(dir, dentry, &arg); ++} ++ ++int aufs_create(struct inode *dir, struct dentry *dentry, umode_t mode, ++ bool want_excl) ++{ ++ struct simple_arg arg = { ++ .type = Creat, ++ .u.c = { ++ .mode = mode, ++ .want_excl = want_excl ++ } ++ }; ++ return add_simple(dir, dentry, &arg); ++} ++ ++int au_aopen_or_create(struct inode *dir, struct dentry *dentry, ++ struct vfsub_aopen_args *aopen_args) ++{ ++ struct simple_arg arg = { ++ .type = Creat, ++ .u.c = { ++ .mode = aopen_args->create_mode, ++ .want_excl = aopen_args->open_flag & O_EXCL, ++ .try_aopen = true, ++ .aopen = aopen_args ++ } ++ }; ++ return add_simple(dir, dentry, &arg); ++} ++ ++int aufs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) ++{ ++ int err; ++ aufs_bindex_t bindex; ++ struct super_block *sb; ++ struct dentry *parent, *h_parent, *h_dentry; ++ struct inode *h_dir, *inode; ++ struct vfsmount *h_mnt; ++ struct au_wr_dir_args wr_dir_args = { ++ .force_btgt = -1, ++ .flags = AuWrDir_TMPFILE ++ }; ++ ++ /* copy-up may happen */ ++ inode_lock(dir); ++ ++ sb = dir->i_sb; ++ err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM); ++ if (unlikely(err)) ++ goto out; ++ ++ err = au_di_init(dentry); ++ if (unlikely(err)) ++ goto out_si; ++ ++ err = -EBUSY; ++ parent = d_find_any_alias(dir); ++ AuDebugOn(!parent); ++ di_write_lock_parent(parent); ++ if (unlikely(d_inode(parent) != dir)) ++ goto out_parent; ++ ++ err = au_digen_test(parent, au_sigen(sb)); ++ if (unlikely(err)) ++ goto out_parent; ++ ++ bindex = au_dbtop(parent); ++ au_set_dbtop(dentry, bindex); ++ au_set_dbbot(dentry, bindex); ++ err = au_wr_dir(dentry, /*src_dentry*/NULL, &wr_dir_args); ++ bindex = err; ++ if (unlikely(err < 0)) ++ goto out_parent; ++ ++ err = -EOPNOTSUPP; ++ h_dir = au_h_iptr(dir, bindex); ++ if (unlikely(!h_dir->i_op->tmpfile)) ++ goto out_parent; ++ ++ h_mnt = au_sbr_mnt(sb, bindex); ++ err = vfsub_mnt_want_write(h_mnt); ++ if (unlikely(err)) ++ goto out_parent; ++ ++ h_parent = au_h_dptr(parent, bindex); ++ err = inode_permission(d_inode(h_parent), MAY_WRITE | MAY_EXEC); ++ if (unlikely(err)) ++ goto out_mnt; ++ ++ err = -ENOMEM; ++ h_dentry = d_alloc(h_parent, &dentry->d_name); ++ if (unlikely(!h_dentry)) ++ goto out_mnt; ++ ++ err = h_dir->i_op->tmpfile(h_dir, h_dentry, mode); ++ if (unlikely(err)) ++ goto out_dentry; ++ ++ au_set_dbtop(dentry, bindex); ++ au_set_dbbot(dentry, bindex); ++ au_set_h_dptr(dentry, bindex, dget(h_dentry)); ++ inode = au_new_inode(dentry, /*must_new*/1); ++ if (IS_ERR(inode)) { ++ err = PTR_ERR(inode); ++ au_set_h_dptr(dentry, bindex, NULL); ++ au_set_dbtop(dentry, -1); ++ au_set_dbbot(dentry, -1); ++ } else { ++ if (!inode->i_nlink) ++ set_nlink(inode, 1); ++ d_tmpfile(dentry, inode); ++ au_di(dentry)->di_tmpfile = 1; ++ ++ /* update without i_mutex */ ++ if (au_ibtop(dir) == au_dbtop(dentry)) ++ au_cpup_attr_timesizes(dir); ++ } ++ ++out_dentry: ++ dput(h_dentry); ++out_mnt: ++ vfsub_mnt_drop_write(h_mnt); ++out_parent: ++ di_write_unlock(parent); ++ dput(parent); ++ di_write_unlock(dentry); ++ if (unlikely(err)) { ++ au_di_fin(dentry); ++ dentry->d_fsdata = NULL; ++ } ++out_si: ++ si_read_unlock(sb); ++out: ++ inode_unlock(dir); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct au_link_args { ++ aufs_bindex_t bdst, bsrc; ++ struct au_pin pin; ++ struct path h_path; ++ struct dentry *src_parent, *parent; ++}; ++ ++static int au_cpup_before_link(struct dentry *src_dentry, ++ struct au_link_args *a) ++{ ++ int err; ++ struct dentry *h_src_dentry; ++ struct au_cp_generic cpg = { ++ .dentry = src_dentry, ++ .bdst = a->bdst, ++ .bsrc = a->bsrc, ++ .len = -1, ++ .pin = &a->pin, ++ .flags = AuCpup_DTIME | AuCpup_HOPEN /* | AuCpup_KEEPLINO */ ++ }; ++ ++ di_read_lock_parent(a->src_parent, AuLock_IR); ++ err = au_test_and_cpup_dirs(src_dentry, a->bdst); ++ if (unlikely(err)) ++ goto out; ++ ++ h_src_dentry = au_h_dptr(src_dentry, a->bsrc); ++ err = au_pin(&a->pin, src_dentry, a->bdst, ++ au_opt_udba(src_dentry->d_sb), ++ AuPin_DI_LOCKED | AuPin_MNT_WRITE); ++ if (unlikely(err)) ++ goto out; ++ ++ err = au_sio_cpup_simple(&cpg); ++ au_unpin(&a->pin); ++ ++out: ++ di_read_unlock(a->src_parent, AuLock_IR); ++ return err; ++} ++ ++static int au_cpup_or_link(struct dentry *src_dentry, struct dentry *dentry, ++ struct au_link_args *a) ++{ ++ int err; ++ unsigned char plink; ++ aufs_bindex_t bbot; ++ struct dentry *h_src_dentry; ++ struct inode *h_inode, *inode, *delegated; ++ struct super_block *sb; ++ struct file *h_file; ++ ++ plink = 0; ++ h_inode = NULL; ++ sb = src_dentry->d_sb; ++ inode = d_inode(src_dentry); ++ if (au_ibtop(inode) <= a->bdst) ++ h_inode = au_h_iptr(inode, a->bdst); ++ if (!h_inode || !h_inode->i_nlink) { ++ /* copyup src_dentry as the name of dentry. */ ++ bbot = au_dbbot(dentry); ++ if (bbot < a->bsrc) ++ au_set_dbbot(dentry, a->bsrc); ++ au_set_h_dptr(dentry, a->bsrc, ++ dget(au_h_dptr(src_dentry, a->bsrc))); ++ dget(a->h_path.dentry); ++ au_set_h_dptr(dentry, a->bdst, NULL); ++ AuDbg("temporary d_inode...\n"); ++ spin_lock(&dentry->d_lock); ++ dentry->d_inode = d_inode(src_dentry); /* tmp */ ++ spin_unlock(&dentry->d_lock); ++ h_file = au_h_open_pre(dentry, a->bsrc, /*force_wr*/0); ++ if (IS_ERR(h_file)) ++ err = PTR_ERR(h_file); ++ else { ++ struct au_cp_generic cpg = { ++ .dentry = dentry, ++ .bdst = a->bdst, ++ .bsrc = -1, ++ .len = -1, ++ .pin = &a->pin, ++ .flags = AuCpup_KEEPLINO ++ }; ++ err = au_sio_cpup_simple(&cpg); ++ au_h_open_post(dentry, a->bsrc, h_file); ++ if (!err) { ++ dput(a->h_path.dentry); ++ a->h_path.dentry = au_h_dptr(dentry, a->bdst); ++ } else ++ au_set_h_dptr(dentry, a->bdst, ++ a->h_path.dentry); ++ } ++ spin_lock(&dentry->d_lock); ++ dentry->d_inode = NULL; /* restore */ ++ spin_unlock(&dentry->d_lock); ++ AuDbg("temporary d_inode...done\n"); ++ au_set_h_dptr(dentry, a->bsrc, NULL); ++ au_set_dbbot(dentry, bbot); ++ } else { ++ /* the inode of src_dentry already exists on a.bdst branch */ ++ h_src_dentry = d_find_alias(h_inode); ++ if (!h_src_dentry && au_plink_test(inode)) { ++ plink = 1; ++ h_src_dentry = au_plink_lkup(inode, a->bdst); ++ err = PTR_ERR(h_src_dentry); ++ if (IS_ERR(h_src_dentry)) ++ goto out; ++ ++ if (unlikely(d_is_negative(h_src_dentry))) { ++ dput(h_src_dentry); ++ h_src_dentry = NULL; ++ } ++ ++ } ++ if (h_src_dentry) { ++ delegated = NULL; ++ err = vfsub_link(h_src_dentry, au_pinned_h_dir(&a->pin), ++ &a->h_path, &delegated); ++ if (unlikely(err == -EWOULDBLOCK)) { ++ pr_warn("cannot retry for NFSv4 delegation" ++ " for an internal link\n"); ++ iput(delegated); ++ } ++ dput(h_src_dentry); ++ } else { ++ AuIOErr("no dentry found for hi%lu on b%d\n", ++ h_inode->i_ino, a->bdst); ++ err = -EIO; ++ } ++ } ++ ++ if (!err && !plink) ++ au_plink_append(inode, a->bdst, a->h_path.dentry); ++ ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++int aufs_link(struct dentry *src_dentry, struct inode *dir, ++ struct dentry *dentry) ++{ ++ int err, rerr; ++ struct au_dtime dt; ++ struct au_link_args *a; ++ struct dentry *wh_dentry, *h_src_dentry; ++ struct inode *inode, *delegated; ++ struct super_block *sb; ++ struct au_wr_dir_args wr_dir_args = { ++ /* .force_btgt = -1, */ ++ .flags = AuWrDir_ADD_ENTRY ++ }; ++ ++ IMustLock(dir); ++ inode = d_inode(src_dentry); ++ IMustLock(inode); ++ ++ err = -ENOMEM; ++ a = kzalloc(sizeof(*a), GFP_NOFS); ++ if (unlikely(!a)) ++ goto out; ++ ++ a->parent = dentry->d_parent; /* dir inode is locked */ ++ err = aufs_read_and_write_lock2(dentry, src_dentry, ++ AuLock_NOPLM | AuLock_GEN); ++ if (unlikely(err)) ++ goto out_kfree; ++ err = au_d_linkable(src_dentry); ++ if (unlikely(err)) ++ goto out_unlock; ++ err = au_d_may_add(dentry); ++ if (unlikely(err)) ++ goto out_unlock; ++ ++ a->src_parent = dget_parent(src_dentry); ++ wr_dir_args.force_btgt = au_ibtop(inode); ++ ++ di_write_lock_parent(a->parent); ++ wr_dir_args.force_btgt = au_wbr(dentry, wr_dir_args.force_btgt); ++ wh_dentry = lock_hdir_lkup_wh(dentry, &dt, src_dentry, &a->pin, ++ &wr_dir_args); ++ err = PTR_ERR(wh_dentry); ++ if (IS_ERR(wh_dentry)) ++ goto out_parent; ++ ++ err = 0; ++ sb = dentry->d_sb; ++ a->bdst = au_dbtop(dentry); ++ a->h_path.dentry = au_h_dptr(dentry, a->bdst); ++ a->h_path.mnt = au_sbr_mnt(sb, a->bdst); ++ a->bsrc = au_ibtop(inode); ++ h_src_dentry = au_h_d_alias(src_dentry, a->bsrc); ++ if (!h_src_dentry && au_di(src_dentry)->di_tmpfile) ++ h_src_dentry = dget(au_hi_wh(inode, a->bsrc)); ++ if (!h_src_dentry) { ++ a->bsrc = au_dbtop(src_dentry); ++ h_src_dentry = au_h_d_alias(src_dentry, a->bsrc); ++ AuDebugOn(!h_src_dentry); ++ } else if (IS_ERR(h_src_dentry)) { ++ err = PTR_ERR(h_src_dentry); ++ goto out_parent; ++ } ++ ++ if (au_opt_test(au_mntflags(sb), PLINK)) { ++ if (a->bdst < a->bsrc ++ /* && h_src_dentry->d_sb != a->h_path.dentry->d_sb */) ++ err = au_cpup_or_link(src_dentry, dentry, a); ++ else { ++ delegated = NULL; ++ err = vfsub_link(h_src_dentry, au_pinned_h_dir(&a->pin), ++ &a->h_path, &delegated); ++ if (unlikely(err == -EWOULDBLOCK)) { ++ pr_warn("cannot retry for NFSv4 delegation" ++ " for an internal link\n"); ++ iput(delegated); ++ } ++ } ++ dput(h_src_dentry); ++ } else { ++ /* ++ * copyup src_dentry to the branch we process, ++ * and then link(2) to it. ++ */ ++ dput(h_src_dentry); ++ if (a->bdst < a->bsrc ++ /* && h_src_dentry->d_sb != a->h_path.dentry->d_sb */) { ++ au_unpin(&a->pin); ++ di_write_unlock(a->parent); ++ err = au_cpup_before_link(src_dentry, a); ++ di_write_lock_parent(a->parent); ++ if (!err) ++ err = au_pin(&a->pin, dentry, a->bdst, ++ au_opt_udba(sb), ++ AuPin_DI_LOCKED | AuPin_MNT_WRITE); ++ if (unlikely(err)) ++ goto out_wh; ++ } ++ if (!err) { ++ h_src_dentry = au_h_dptr(src_dentry, a->bdst); ++ err = -ENOENT; ++ if (h_src_dentry && d_is_positive(h_src_dentry)) { ++ delegated = NULL; ++ err = vfsub_link(h_src_dentry, ++ au_pinned_h_dir(&a->pin), ++ &a->h_path, &delegated); ++ if (unlikely(err == -EWOULDBLOCK)) { ++ pr_warn("cannot retry" ++ " for NFSv4 delegation" ++ " for an internal link\n"); ++ iput(delegated); ++ } ++ } ++ } ++ } ++ if (unlikely(err)) ++ goto out_unpin; ++ ++ if (wh_dentry) { ++ a->h_path.dentry = wh_dentry; ++ err = au_wh_unlink_dentry(au_pinned_h_dir(&a->pin), &a->h_path, ++ dentry); ++ if (unlikely(err)) ++ goto out_revert; ++ } ++ ++ au_dir_ts(dir, a->bdst); ++ dir->i_version++; ++ inc_nlink(inode); ++ inode->i_ctime = dir->i_ctime; ++ d_instantiate(dentry, au_igrab(inode)); ++ if (d_unhashed(a->h_path.dentry)) ++ /* some filesystem calls d_drop() */ ++ d_drop(dentry); ++ /* some filesystems consume an inode even hardlink */ ++ au_fhsm_wrote(sb, a->bdst, /*force*/0); ++ goto out_unpin; /* success */ ++ ++out_revert: ++ /* no delegation since it is just created */ ++ rerr = vfsub_unlink(au_pinned_h_dir(&a->pin), &a->h_path, ++ /*delegated*/NULL, /*force*/0); ++ if (unlikely(rerr)) { ++ AuIOErr("%pd reverting failed(%d, %d)\n", dentry, err, rerr); ++ err = -EIO; ++ } ++ au_dtime_revert(&dt); ++out_unpin: ++ au_unpin(&a->pin); ++out_wh: ++ dput(wh_dentry); ++out_parent: ++ di_write_unlock(a->parent); ++ dput(a->src_parent); ++out_unlock: ++ if (unlikely(err)) { ++ au_update_dbtop(dentry); ++ d_drop(dentry); ++ } ++ aufs_read_and_write_unlock2(dentry, src_dentry); ++out_kfree: ++ kfree(a); ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++int aufs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) ++{ ++ int err, rerr; ++ aufs_bindex_t bindex; ++ unsigned char diropq; ++ struct path h_path; ++ struct dentry *wh_dentry, *parent, *opq_dentry; ++ struct inode *h_inode; ++ struct super_block *sb; ++ struct { ++ struct au_pin pin; ++ struct au_dtime dt; ++ } *a; /* reduce the stack usage */ ++ struct au_wr_dir_args wr_dir_args = { ++ .force_btgt = -1, ++ .flags = AuWrDir_ADD_ENTRY | AuWrDir_ISDIR ++ }; ++ ++ IMustLock(dir); ++ ++ err = -ENOMEM; ++ a = kmalloc(sizeof(*a), GFP_NOFS); ++ if (unlikely(!a)) ++ goto out; ++ ++ err = aufs_read_lock(dentry, AuLock_DW | AuLock_GEN); ++ if (unlikely(err)) ++ goto out_free; ++ err = au_d_may_add(dentry); ++ if (unlikely(err)) ++ goto out_unlock; ++ ++ parent = dentry->d_parent; /* dir inode is locked */ ++ di_write_lock_parent(parent); ++ wh_dentry = lock_hdir_lkup_wh(dentry, &a->dt, /*src_dentry*/NULL, ++ &a->pin, &wr_dir_args); ++ err = PTR_ERR(wh_dentry); ++ if (IS_ERR(wh_dentry)) ++ goto out_parent; ++ ++ sb = dentry->d_sb; ++ bindex = au_dbtop(dentry); ++ h_path.dentry = au_h_dptr(dentry, bindex); ++ h_path.mnt = au_sbr_mnt(sb, bindex); ++ err = vfsub_mkdir(au_pinned_h_dir(&a->pin), &h_path, mode); ++ if (unlikely(err)) ++ goto out_unpin; ++ ++ /* make the dir opaque */ ++ diropq = 0; ++ h_inode = d_inode(h_path.dentry); ++ if (wh_dentry ++ || au_opt_test(au_mntflags(sb), ALWAYS_DIROPQ)) { ++ inode_lock_nested(h_inode, AuLsc_I_CHILD); ++ opq_dentry = au_diropq_create(dentry, bindex); ++ inode_unlock(h_inode); ++ err = PTR_ERR(opq_dentry); ++ if (IS_ERR(opq_dentry)) ++ goto out_dir; ++ dput(opq_dentry); ++ diropq = 1; ++ } ++ ++ err = epilog(dir, bindex, wh_dentry, dentry); ++ if (!err) { ++ inc_nlink(dir); ++ goto out_unpin; /* success */ ++ } ++ ++ /* revert */ ++ if (diropq) { ++ AuLabel(revert opq); ++ inode_lock_nested(h_inode, AuLsc_I_CHILD); ++ rerr = au_diropq_remove(dentry, bindex); ++ inode_unlock(h_inode); ++ if (rerr) { ++ AuIOErr("%pd reverting diropq failed(%d, %d)\n", ++ dentry, err, rerr); ++ err = -EIO; ++ } ++ } ++ ++out_dir: ++ AuLabel(revert dir); ++ rerr = vfsub_rmdir(au_pinned_h_dir(&a->pin), &h_path); ++ if (rerr) { ++ AuIOErr("%pd reverting dir failed(%d, %d)\n", ++ dentry, err, rerr); ++ err = -EIO; ++ } ++ au_dtime_revert(&a->dt); ++out_unpin: ++ au_unpin(&a->pin); ++ dput(wh_dentry); ++out_parent: ++ di_write_unlock(parent); ++out_unlock: ++ if (unlikely(err)) { ++ au_update_dbtop(dentry); ++ d_drop(dentry); ++ } ++ aufs_read_unlock(dentry, AuLock_DW); ++out_free: ++ kfree(a); ++out: ++ return err; ++} +diff --git a/fs/aufs/i_op_del.c b/fs/aufs/i_op_del.c +new file mode 100644 +index 0000000..3fe30b6 +--- /dev/null ++++ b/fs/aufs/i_op_del.c +@@ -0,0 +1,510 @@ ++/* ++ * Copyright (C) 2005-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * inode operations (del entry) ++ */ ++ ++#include "aufs.h" ++ ++/* ++ * decide if a new whiteout for @dentry is necessary or not. ++ * when it is necessary, prepare the parent dir for the upper branch whose ++ * branch index is @bcpup for creation. the actual creation of the whiteout will ++ * be done by caller. ++ * return value: ++ * 0: wh is unnecessary ++ * plus: wh is necessary ++ * minus: error ++ */ ++int au_wr_dir_need_wh(struct dentry *dentry, int isdir, aufs_bindex_t *bcpup) ++{ ++ int need_wh, err; ++ aufs_bindex_t btop; ++ struct super_block *sb; ++ ++ sb = dentry->d_sb; ++ btop = au_dbtop(dentry); ++ if (*bcpup < 0) { ++ *bcpup = btop; ++ if (au_test_ro(sb, btop, d_inode(dentry))) { ++ err = AuWbrCopyup(au_sbi(sb), dentry); ++ *bcpup = err; ++ if (unlikely(err < 0)) ++ goto out; ++ } ++ } else ++ AuDebugOn(btop < *bcpup ++ || au_test_ro(sb, *bcpup, d_inode(dentry))); ++ AuDbg("bcpup %d, btop %d\n", *bcpup, btop); ++ ++ if (*bcpup != btop) { ++ err = au_cpup_dirs(dentry, *bcpup); ++ if (unlikely(err)) ++ goto out; ++ need_wh = 1; ++ } else { ++ struct au_dinfo *dinfo, *tmp; ++ ++ need_wh = -ENOMEM; ++ dinfo = au_di(dentry); ++ tmp = au_di_alloc(sb, AuLsc_DI_TMP); ++ if (tmp) { ++ au_di_cp(tmp, dinfo); ++ au_di_swap(tmp, dinfo); ++ /* returns the number of positive dentries */ ++ need_wh = au_lkup_dentry(dentry, btop + 1, /*type*/0); ++ au_di_swap(tmp, dinfo); ++ au_rw_write_unlock(&tmp->di_rwsem); ++ au_di_free(tmp); ++ } ++ } ++ AuDbg("need_wh %d\n", need_wh); ++ err = need_wh; ++ ++out: ++ return err; ++} ++ ++/* ++ * simple tests for the del-entry operations. ++ * following the checks in vfs, plus the parent-child relationship. ++ */ ++int au_may_del(struct dentry *dentry, aufs_bindex_t bindex, ++ struct dentry *h_parent, int isdir) ++{ ++ int err; ++ umode_t h_mode; ++ struct dentry *h_dentry, *h_latest; ++ struct inode *h_inode; ++ ++ h_dentry = au_h_dptr(dentry, bindex); ++ if (d_really_is_positive(dentry)) { ++ err = -ENOENT; ++ if (unlikely(d_is_negative(h_dentry))) ++ goto out; ++ h_inode = d_inode(h_dentry); ++ if (unlikely(!h_inode->i_nlink)) ++ goto out; ++ ++ h_mode = h_inode->i_mode; ++ if (!isdir) { ++ err = -EISDIR; ++ if (unlikely(S_ISDIR(h_mode))) ++ goto out; ++ } else if (unlikely(!S_ISDIR(h_mode))) { ++ err = -ENOTDIR; ++ goto out; ++ } ++ } else { ++ /* rename(2) case */ ++ err = -EIO; ++ if (unlikely(d_is_positive(h_dentry))) ++ goto out; ++ } ++ ++ err = -ENOENT; ++ /* expected parent dir is locked */ ++ if (unlikely(h_parent != h_dentry->d_parent)) ++ goto out; ++ err = 0; ++ ++ /* ++ * rmdir a dir may break the consistency on some filesystem. ++ * let's try heavy test. ++ */ ++ err = -EACCES; ++ if (unlikely(!au_opt_test(au_mntflags(dentry->d_sb), DIRPERM1) ++ && au_test_h_perm(d_inode(h_parent), ++ MAY_EXEC | MAY_WRITE))) ++ goto out; ++ ++ h_latest = au_sio_lkup_one(&dentry->d_name, h_parent); ++ err = -EIO; ++ if (IS_ERR(h_latest)) ++ goto out; ++ if (h_latest == h_dentry) ++ err = 0; ++ dput(h_latest); ++ ++out: ++ return err; ++} ++ ++/* ++ * decide the branch where we operate for @dentry. the branch index will be set ++ * @rbcpup. after diciding it, 'pin' it and store the timestamps of the parent ++ * dir for reverting. ++ * when a new whiteout is necessary, create it. ++ */ ++static struct dentry* ++lock_hdir_create_wh(struct dentry *dentry, int isdir, aufs_bindex_t *rbcpup, ++ struct au_dtime *dt, struct au_pin *pin) ++{ ++ struct dentry *wh_dentry; ++ struct super_block *sb; ++ struct path h_path; ++ int err, need_wh; ++ unsigned int udba; ++ aufs_bindex_t bcpup; ++ ++ need_wh = au_wr_dir_need_wh(dentry, isdir, rbcpup); ++ wh_dentry = ERR_PTR(need_wh); ++ if (unlikely(need_wh < 0)) ++ goto out; ++ ++ sb = dentry->d_sb; ++ udba = au_opt_udba(sb); ++ bcpup = *rbcpup; ++ err = au_pin(pin, dentry, bcpup, udba, ++ AuPin_DI_LOCKED | AuPin_MNT_WRITE); ++ wh_dentry = ERR_PTR(err); ++ if (unlikely(err)) ++ goto out; ++ ++ h_path.dentry = au_pinned_h_parent(pin); ++ if (udba != AuOpt_UDBA_NONE ++ && au_dbtop(dentry) == bcpup) { ++ err = au_may_del(dentry, bcpup, h_path.dentry, isdir); ++ wh_dentry = ERR_PTR(err); ++ if (unlikely(err)) ++ goto out_unpin; ++ } ++ ++ h_path.mnt = au_sbr_mnt(sb, bcpup); ++ au_dtime_store(dt, au_pinned_parent(pin), &h_path); ++ wh_dentry = NULL; ++ if (!need_wh) ++ goto out; /* success, no need to create whiteout */ ++ ++ wh_dentry = au_wh_create(dentry, bcpup, h_path.dentry); ++ if (IS_ERR(wh_dentry)) ++ goto out_unpin; ++ ++ /* returns with the parent is locked and wh_dentry is dget-ed */ ++ goto out; /* success */ ++ ++out_unpin: ++ au_unpin(pin); ++out: ++ return wh_dentry; ++} ++ ++/* ++ * when removing a dir, rename it to a unique temporary whiteout-ed name first ++ * in order to be revertible and save time for removing many child whiteouts ++ * under the dir. ++ * returns 1 when there are too many child whiteout and caller should remove ++ * them asynchronously. returns 0 when the number of children is enough small to ++ * remove now or the branch fs is a remote fs. ++ * otherwise return an error. ++ */ ++static int renwh_and_rmdir(struct dentry *dentry, aufs_bindex_t bindex, ++ struct au_nhash *whlist, struct inode *dir) ++{ ++ int rmdir_later, err, dirwh; ++ struct dentry *h_dentry; ++ struct super_block *sb; ++ struct inode *inode; ++ ++ sb = dentry->d_sb; ++ SiMustAnyLock(sb); ++ h_dentry = au_h_dptr(dentry, bindex); ++ err = au_whtmp_ren(h_dentry, au_sbr(sb, bindex)); ++ if (unlikely(err)) ++ goto out; ++ ++ /* stop monitoring */ ++ inode = d_inode(dentry); ++ au_hn_free(au_hi(inode, bindex)); ++ ++ if (!au_test_fs_remote(h_dentry->d_sb)) { ++ dirwh = au_sbi(sb)->si_dirwh; ++ rmdir_later = (dirwh <= 1); ++ if (!rmdir_later) ++ rmdir_later = au_nhash_test_longer_wh(whlist, bindex, ++ dirwh); ++ if (rmdir_later) ++ return rmdir_later; ++ } ++ ++ err = au_whtmp_rmdir(dir, bindex, h_dentry, whlist); ++ if (unlikely(err)) { ++ AuIOErr("rmdir %pd, b%d failed, %d. ignored\n", ++ h_dentry, bindex, err); ++ err = 0; ++ } ++ ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++/* ++ * final procedure for deleting a entry. ++ * maintain dentry and iattr. ++ */ ++static void epilog(struct inode *dir, struct dentry *dentry, ++ aufs_bindex_t bindex) ++{ ++ struct inode *inode; ++ ++ inode = d_inode(dentry); ++ d_drop(dentry); ++ inode->i_ctime = dir->i_ctime; ++ ++ au_dir_ts(dir, bindex); ++ dir->i_version++; ++} ++ ++/* ++ * when an error happened, remove the created whiteout and revert everything. ++ */ ++static int do_revert(int err, struct inode *dir, aufs_bindex_t bindex, ++ aufs_bindex_t bwh, struct dentry *wh_dentry, ++ struct dentry *dentry, struct au_dtime *dt) ++{ ++ int rerr; ++ struct path h_path = { ++ .dentry = wh_dentry, ++ .mnt = au_sbr_mnt(dir->i_sb, bindex) ++ }; ++ ++ rerr = au_wh_unlink_dentry(au_h_iptr(dir, bindex), &h_path, dentry); ++ if (!rerr) { ++ au_set_dbwh(dentry, bwh); ++ au_dtime_revert(dt); ++ return 0; ++ } ++ ++ AuIOErr("%pd reverting whiteout failed(%d, %d)\n", dentry, err, rerr); ++ return -EIO; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++int aufs_unlink(struct inode *dir, struct dentry *dentry) ++{ ++ int err; ++ aufs_bindex_t bwh, bindex, btop; ++ struct inode *inode, *h_dir, *delegated; ++ struct dentry *parent, *wh_dentry; ++ /* to reuduce stack size */ ++ struct { ++ struct au_dtime dt; ++ struct au_pin pin; ++ struct path h_path; ++ } *a; ++ ++ IMustLock(dir); ++ ++ err = -ENOMEM; ++ a = kmalloc(sizeof(*a), GFP_NOFS); ++ if (unlikely(!a)) ++ goto out; ++ ++ err = aufs_read_lock(dentry, AuLock_DW | AuLock_GEN); ++ if (unlikely(err)) ++ goto out_free; ++ err = au_d_hashed_positive(dentry); ++ if (unlikely(err)) ++ goto out_unlock; ++ inode = d_inode(dentry); ++ IMustLock(inode); ++ err = -EISDIR; ++ if (unlikely(d_is_dir(dentry))) ++ goto out_unlock; /* possible? */ ++ ++ btop = au_dbtop(dentry); ++ bwh = au_dbwh(dentry); ++ bindex = -1; ++ parent = dentry->d_parent; /* dir inode is locked */ ++ di_write_lock_parent(parent); ++ wh_dentry = lock_hdir_create_wh(dentry, /*isdir*/0, &bindex, &a->dt, ++ &a->pin); ++ err = PTR_ERR(wh_dentry); ++ if (IS_ERR(wh_dentry)) ++ goto out_parent; ++ ++ a->h_path.mnt = au_sbr_mnt(dentry->d_sb, btop); ++ a->h_path.dentry = au_h_dptr(dentry, btop); ++ dget(a->h_path.dentry); ++ if (bindex == btop) { ++ h_dir = au_pinned_h_dir(&a->pin); ++ delegated = NULL; ++ err = vfsub_unlink(h_dir, &a->h_path, &delegated, /*force*/0); ++ if (unlikely(err == -EWOULDBLOCK)) { ++ pr_warn("cannot retry for NFSv4 delegation" ++ " for an internal unlink\n"); ++ iput(delegated); ++ } ++ } else { ++ /* dir inode is locked */ ++ h_dir = d_inode(wh_dentry->d_parent); ++ IMustLock(h_dir); ++ err = 0; ++ } ++ ++ if (!err) { ++ vfsub_drop_nlink(inode); ++ epilog(dir, dentry, bindex); ++ ++ /* update target timestamps */ ++ if (bindex == btop) { ++ vfsub_update_h_iattr(&a->h_path, /*did*/NULL); ++ /*ignore*/ ++ inode->i_ctime = d_inode(a->h_path.dentry)->i_ctime; ++ } else ++ /* todo: this timestamp may be reverted later */ ++ inode->i_ctime = h_dir->i_ctime; ++ goto out_unpin; /* success */ ++ } ++ ++ /* revert */ ++ if (wh_dentry) { ++ int rerr; ++ ++ rerr = do_revert(err, dir, bindex, bwh, wh_dentry, dentry, ++ &a->dt); ++ if (rerr) ++ err = rerr; ++ } ++ ++out_unpin: ++ au_unpin(&a->pin); ++ dput(wh_dentry); ++ dput(a->h_path.dentry); ++out_parent: ++ di_write_unlock(parent); ++out_unlock: ++ aufs_read_unlock(dentry, AuLock_DW); ++out_free: ++ kfree(a); ++out: ++ return err; ++} ++ ++int aufs_rmdir(struct inode *dir, struct dentry *dentry) ++{ ++ int err, rmdir_later; ++ aufs_bindex_t bwh, bindex, btop; ++ struct inode *inode; ++ struct dentry *parent, *wh_dentry, *h_dentry; ++ struct au_whtmp_rmdir *args; ++ /* to reuduce stack size */ ++ struct { ++ struct au_dtime dt; ++ struct au_pin pin; ++ } *a; ++ ++ IMustLock(dir); ++ ++ err = -ENOMEM; ++ a = kmalloc(sizeof(*a), GFP_NOFS); ++ if (unlikely(!a)) ++ goto out; ++ ++ err = aufs_read_lock(dentry, AuLock_DW | AuLock_FLUSH | AuLock_GEN); ++ if (unlikely(err)) ++ goto out_free; ++ err = au_alive_dir(dentry); ++ if (unlikely(err)) ++ goto out_unlock; ++ inode = d_inode(dentry); ++ IMustLock(inode); ++ err = -ENOTDIR; ++ if (unlikely(!d_is_dir(dentry))) ++ goto out_unlock; /* possible? */ ++ ++ err = -ENOMEM; ++ args = au_whtmp_rmdir_alloc(dir->i_sb, GFP_NOFS); ++ if (unlikely(!args)) ++ goto out_unlock; ++ ++ parent = dentry->d_parent; /* dir inode is locked */ ++ di_write_lock_parent(parent); ++ err = au_test_empty(dentry, &args->whlist); ++ if (unlikely(err)) ++ goto out_parent; ++ ++ btop = au_dbtop(dentry); ++ bwh = au_dbwh(dentry); ++ bindex = -1; ++ wh_dentry = lock_hdir_create_wh(dentry, /*isdir*/1, &bindex, &a->dt, ++ &a->pin); ++ err = PTR_ERR(wh_dentry); ++ if (IS_ERR(wh_dentry)) ++ goto out_parent; ++ ++ h_dentry = au_h_dptr(dentry, btop); ++ dget(h_dentry); ++ rmdir_later = 0; ++ if (bindex == btop) { ++ err = renwh_and_rmdir(dentry, btop, &args->whlist, dir); ++ if (err > 0) { ++ rmdir_later = err; ++ err = 0; ++ } ++ } else { ++ /* stop monitoring */ ++ au_hn_free(au_hi(inode, btop)); ++ ++ /* dir inode is locked */ ++ IMustLock(d_inode(wh_dentry->d_parent)); ++ err = 0; ++ } ++ ++ if (!err) { ++ vfsub_dead_dir(inode); ++ au_set_dbdiropq(dentry, -1); ++ epilog(dir, dentry, bindex); ++ ++ if (rmdir_later) { ++ au_whtmp_kick_rmdir(dir, btop, h_dentry, args); ++ args = NULL; ++ } ++ ++ goto out_unpin; /* success */ ++ } ++ ++ /* revert */ ++ AuLabel(revert); ++ if (wh_dentry) { ++ int rerr; ++ ++ rerr = do_revert(err, dir, bindex, bwh, wh_dentry, dentry, ++ &a->dt); ++ if (rerr) ++ err = rerr; ++ } ++ ++out_unpin: ++ au_unpin(&a->pin); ++ dput(wh_dentry); ++ dput(h_dentry); ++out_parent: ++ di_write_unlock(parent); ++ if (args) ++ au_whtmp_rmdir_free(args); ++out_unlock: ++ aufs_read_unlock(dentry, AuLock_DW); ++out_free: ++ kfree(a); ++out: ++ AuTraceErr(err); ++ return err; ++} +diff --git a/fs/aufs/i_op_ren.c b/fs/aufs/i_op_ren.c +new file mode 100644 +index 0000000..cd2204b +--- /dev/null ++++ b/fs/aufs/i_op_ren.c +@@ -0,0 +1,1015 @@ ++/* ++ * Copyright (C) 2005-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * inode operation (rename entry) ++ * todo: this is crazy monster ++ */ ++ ++#include "aufs.h" ++ ++enum { AuSRC, AuDST, AuSrcDst }; ++enum { AuPARENT, AuCHILD, AuParentChild }; ++ ++#define AuRen_ISDIR 1 ++#define AuRen_ISSAMEDIR (1 << 1) ++#define AuRen_WHSRC (1 << 2) ++#define AuRen_WHDST (1 << 3) ++#define AuRen_MNT_WRITE (1 << 4) ++#define AuRen_DT_DSTDIR (1 << 5) ++#define AuRen_DIROPQ (1 << 6) ++#define au_ftest_ren(flags, name) ((flags) & AuRen_##name) ++#define au_fset_ren(flags, name) \ ++ do { (flags) |= AuRen_##name; } while (0) ++#define au_fclr_ren(flags, name) \ ++ do { (flags) &= ~AuRen_##name; } while (0) ++ ++struct au_ren_args { ++ struct { ++ struct dentry *dentry, *h_dentry, *parent, *h_parent, ++ *wh_dentry; ++ struct inode *dir, *inode; ++ struct au_hinode *hdir; ++ struct au_dtime dt[AuParentChild]; ++ aufs_bindex_t btop; ++ } sd[AuSrcDst]; ++ ++#define src_dentry sd[AuSRC].dentry ++#define src_dir sd[AuSRC].dir ++#define src_inode sd[AuSRC].inode ++#define src_h_dentry sd[AuSRC].h_dentry ++#define src_parent sd[AuSRC].parent ++#define src_h_parent sd[AuSRC].h_parent ++#define src_wh_dentry sd[AuSRC].wh_dentry ++#define src_hdir sd[AuSRC].hdir ++#define src_h_dir sd[AuSRC].hdir->hi_inode ++#define src_dt sd[AuSRC].dt ++#define src_btop sd[AuSRC].btop ++ ++#define dst_dentry sd[AuDST].dentry ++#define dst_dir sd[AuDST].dir ++#define dst_inode sd[AuDST].inode ++#define dst_h_dentry sd[AuDST].h_dentry ++#define dst_parent sd[AuDST].parent ++#define dst_h_parent sd[AuDST].h_parent ++#define dst_wh_dentry sd[AuDST].wh_dentry ++#define dst_hdir sd[AuDST].hdir ++#define dst_h_dir sd[AuDST].hdir->hi_inode ++#define dst_dt sd[AuDST].dt ++#define dst_btop sd[AuDST].btop ++ ++ struct dentry *h_trap; ++ struct au_branch *br; ++ struct au_hinode *src_hinode; ++ struct path h_path; ++ struct au_nhash whlist; ++ aufs_bindex_t btgt, src_bwh, src_bdiropq; ++ ++ unsigned int flags; ++ ++ struct au_whtmp_rmdir *thargs; ++ struct dentry *h_dst; ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * functions for reverting. ++ * when an error happened in a single rename systemcall, we should revert ++ * everything as if nothing happened. ++ * we don't need to revert the copied-up/down the parent dir since they are ++ * harmless. ++ */ ++ ++#define RevertFailure(fmt, ...) do { \ ++ AuIOErr("revert failure: " fmt " (%d, %d)\n", \ ++ ##__VA_ARGS__, err, rerr); \ ++ err = -EIO; \ ++} while (0) ++ ++static void au_ren_rev_diropq(int err, struct au_ren_args *a) ++{ ++ int rerr; ++ ++ au_hn_imtx_lock_nested(a->src_hinode, AuLsc_I_CHILD); ++ rerr = au_diropq_remove(a->src_dentry, a->btgt); ++ au_hn_imtx_unlock(a->src_hinode); ++ au_set_dbdiropq(a->src_dentry, a->src_bdiropq); ++ if (rerr) ++ RevertFailure("remove diropq %pd", a->src_dentry); ++} ++ ++static void au_ren_rev_rename(int err, struct au_ren_args *a) ++{ ++ int rerr; ++ struct inode *delegated; ++ ++ a->h_path.dentry = vfsub_lkup_one(&a->src_dentry->d_name, ++ a->src_h_parent); ++ rerr = PTR_ERR(a->h_path.dentry); ++ if (IS_ERR(a->h_path.dentry)) { ++ RevertFailure("lkup one %pd", a->src_dentry); ++ return; ++ } ++ ++ delegated = NULL; ++ rerr = vfsub_rename(a->dst_h_dir, ++ au_h_dptr(a->src_dentry, a->btgt), ++ a->src_h_dir, &a->h_path, &delegated); ++ if (unlikely(rerr == -EWOULDBLOCK)) { ++ pr_warn("cannot retry for NFSv4 delegation" ++ " for an internal rename\n"); ++ iput(delegated); ++ } ++ d_drop(a->h_path.dentry); ++ dput(a->h_path.dentry); ++ /* au_set_h_dptr(a->src_dentry, a->btgt, NULL); */ ++ if (rerr) ++ RevertFailure("rename %pd", a->src_dentry); ++} ++ ++static void au_ren_rev_whtmp(int err, struct au_ren_args *a) ++{ ++ int rerr; ++ struct inode *delegated; ++ ++ a->h_path.dentry = vfsub_lkup_one(&a->dst_dentry->d_name, ++ a->dst_h_parent); ++ rerr = PTR_ERR(a->h_path.dentry); ++ if (IS_ERR(a->h_path.dentry)) { ++ RevertFailure("lkup one %pd", a->dst_dentry); ++ return; ++ } ++ if (d_is_positive(a->h_path.dentry)) { ++ d_drop(a->h_path.dentry); ++ dput(a->h_path.dentry); ++ return; ++ } ++ ++ delegated = NULL; ++ rerr = vfsub_rename(a->dst_h_dir, a->h_dst, a->dst_h_dir, &a->h_path, ++ &delegated); ++ if (unlikely(rerr == -EWOULDBLOCK)) { ++ pr_warn("cannot retry for NFSv4 delegation" ++ " for an internal rename\n"); ++ iput(delegated); ++ } ++ d_drop(a->h_path.dentry); ++ dput(a->h_path.dentry); ++ if (!rerr) ++ au_set_h_dptr(a->dst_dentry, a->btgt, dget(a->h_dst)); ++ else ++ RevertFailure("rename %pd", a->h_dst); ++} ++ ++static void au_ren_rev_whsrc(int err, struct au_ren_args *a) ++{ ++ int rerr; ++ ++ a->h_path.dentry = a->src_wh_dentry; ++ rerr = au_wh_unlink_dentry(a->src_h_dir, &a->h_path, a->src_dentry); ++ au_set_dbwh(a->src_dentry, a->src_bwh); ++ if (rerr) ++ RevertFailure("unlink %pd", a->src_wh_dentry); ++} ++#undef RevertFailure ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * when we have to copyup the renaming entry, do it with the rename-target name ++ * in order to minimize the cost (the later actual rename is unnecessary). ++ * otherwise rename it on the target branch. ++ */ ++static int au_ren_or_cpup(struct au_ren_args *a) ++{ ++ int err; ++ struct dentry *d; ++ struct inode *delegated; ++ ++ d = a->src_dentry; ++ if (au_dbtop(d) == a->btgt) { ++ a->h_path.dentry = a->dst_h_dentry; ++ if (au_ftest_ren(a->flags, DIROPQ) ++ && au_dbdiropq(d) == a->btgt) ++ au_fclr_ren(a->flags, DIROPQ); ++ AuDebugOn(au_dbtop(d) != a->btgt); ++ delegated = NULL; ++ err = vfsub_rename(a->src_h_dir, au_h_dptr(d, a->btgt), ++ a->dst_h_dir, &a->h_path, &delegated); ++ if (unlikely(err == -EWOULDBLOCK)) { ++ pr_warn("cannot retry for NFSv4 delegation" ++ " for an internal rename\n"); ++ iput(delegated); ++ } ++ } else ++ BUG(); ++ ++ if (!err && a->h_dst) ++ /* it will be set to dinfo later */ ++ dget(a->h_dst); ++ ++ return err; ++} ++ ++/* cf. aufs_rmdir() */ ++static int au_ren_del_whtmp(struct au_ren_args *a) ++{ ++ int err; ++ struct inode *dir; ++ ++ dir = a->dst_dir; ++ SiMustAnyLock(dir->i_sb); ++ if (!au_nhash_test_longer_wh(&a->whlist, a->btgt, ++ au_sbi(dir->i_sb)->si_dirwh) ++ || au_test_fs_remote(a->h_dst->d_sb)) { ++ err = au_whtmp_rmdir(dir, a->btgt, a->h_dst, &a->whlist); ++ if (unlikely(err)) ++ pr_warn("failed removing whtmp dir %pd (%d), " ++ "ignored.\n", a->h_dst, err); ++ } else { ++ au_nhash_wh_free(&a->thargs->whlist); ++ a->thargs->whlist = a->whlist; ++ a->whlist.nh_num = 0; ++ au_whtmp_kick_rmdir(dir, a->btgt, a->h_dst, a->thargs); ++ dput(a->h_dst); ++ a->thargs = NULL; ++ } ++ ++ return 0; ++} ++ ++/* make it 'opaque' dir. */ ++static int au_ren_diropq(struct au_ren_args *a) ++{ ++ int err; ++ struct dentry *diropq; ++ ++ err = 0; ++ a->src_bdiropq = au_dbdiropq(a->src_dentry); ++ a->src_hinode = au_hi(a->src_inode, a->btgt); ++ au_hn_imtx_lock_nested(a->src_hinode, AuLsc_I_CHILD); ++ diropq = au_diropq_create(a->src_dentry, a->btgt); ++ au_hn_imtx_unlock(a->src_hinode); ++ if (IS_ERR(diropq)) ++ err = PTR_ERR(diropq); ++ else ++ dput(diropq); ++ ++ return err; ++} ++ ++static int do_rename(struct au_ren_args *a) ++{ ++ int err; ++ struct dentry *d, *h_d; ++ ++ /* prepare workqueue args for asynchronous rmdir */ ++ h_d = a->dst_h_dentry; ++ if (au_ftest_ren(a->flags, ISDIR) && d_is_positive(h_d)) { ++ err = -ENOMEM; ++ a->thargs = au_whtmp_rmdir_alloc(a->src_dentry->d_sb, GFP_NOFS); ++ if (unlikely(!a->thargs)) ++ goto out; ++ a->h_dst = dget(h_d); ++ } ++ ++ /* create whiteout for src_dentry */ ++ if (au_ftest_ren(a->flags, WHSRC)) { ++ a->src_bwh = au_dbwh(a->src_dentry); ++ AuDebugOn(a->src_bwh >= 0); ++ a->src_wh_dentry ++ = au_wh_create(a->src_dentry, a->btgt, a->src_h_parent); ++ err = PTR_ERR(a->src_wh_dentry); ++ if (IS_ERR(a->src_wh_dentry)) ++ goto out_thargs; ++ } ++ ++ /* lookup whiteout for dentry */ ++ if (au_ftest_ren(a->flags, WHDST)) { ++ h_d = au_wh_lkup(a->dst_h_parent, &a->dst_dentry->d_name, ++ a->br); ++ err = PTR_ERR(h_d); ++ if (IS_ERR(h_d)) ++ goto out_whsrc; ++ if (d_is_negative(h_d)) ++ dput(h_d); ++ else ++ a->dst_wh_dentry = h_d; ++ } ++ ++ /* rename dentry to tmpwh */ ++ if (a->thargs) { ++ err = au_whtmp_ren(a->dst_h_dentry, a->br); ++ if (unlikely(err)) ++ goto out_whdst; ++ ++ d = a->dst_dentry; ++ au_set_h_dptr(d, a->btgt, NULL); ++ err = au_lkup_neg(d, a->btgt, /*wh*/0); ++ if (unlikely(err)) ++ goto out_whtmp; ++ a->dst_h_dentry = au_h_dptr(d, a->btgt); ++ } ++ ++ BUG_ON(d_is_positive(a->dst_h_dentry) && a->src_btop != a->btgt); ++ ++ /* rename by vfs_rename or cpup */ ++ d = a->dst_dentry; ++ if (au_ftest_ren(a->flags, ISDIR) ++ && (a->dst_wh_dentry ++ || au_dbdiropq(d) == a->btgt ++ /* hide the lower to keep xino */ ++ || a->btgt < au_dbbot(d) ++ || au_opt_test(au_mntflags(d->d_sb), ALWAYS_DIROPQ))) ++ au_fset_ren(a->flags, DIROPQ); ++ err = au_ren_or_cpup(a); ++ if (unlikely(err)) ++ /* leave the copied-up one */ ++ goto out_whtmp; ++ ++ /* make dir opaque */ ++ if (au_ftest_ren(a->flags, DIROPQ)) { ++ err = au_ren_diropq(a); ++ if (unlikely(err)) ++ goto out_rename; ++ } ++ ++ /* update target timestamps */ ++ AuDebugOn(au_dbtop(a->src_dentry) != a->btgt); ++ a->h_path.dentry = au_h_dptr(a->src_dentry, a->btgt); ++ vfsub_update_h_iattr(&a->h_path, /*did*/NULL); /*ignore*/ ++ a->src_inode->i_ctime = d_inode(a->h_path.dentry)->i_ctime; ++ ++ /* remove whiteout for dentry */ ++ if (a->dst_wh_dentry) { ++ a->h_path.dentry = a->dst_wh_dentry; ++ err = au_wh_unlink_dentry(a->dst_h_dir, &a->h_path, ++ a->dst_dentry); ++ if (unlikely(err)) ++ goto out_diropq; ++ } ++ ++ /* remove whtmp */ ++ if (a->thargs) ++ au_ren_del_whtmp(a); /* ignore this error */ ++ ++ au_fhsm_wrote(a->src_dentry->d_sb, a->btgt, /*force*/0); ++ err = 0; ++ goto out_success; ++ ++out_diropq: ++ if (au_ftest_ren(a->flags, DIROPQ)) ++ au_ren_rev_diropq(err, a); ++out_rename: ++ au_ren_rev_rename(err, a); ++ dput(a->h_dst); ++out_whtmp: ++ if (a->thargs) ++ au_ren_rev_whtmp(err, a); ++out_whdst: ++ dput(a->dst_wh_dentry); ++ a->dst_wh_dentry = NULL; ++out_whsrc: ++ if (a->src_wh_dentry) ++ au_ren_rev_whsrc(err, a); ++out_success: ++ dput(a->src_wh_dentry); ++ dput(a->dst_wh_dentry); ++out_thargs: ++ if (a->thargs) { ++ dput(a->h_dst); ++ au_whtmp_rmdir_free(a->thargs); ++ a->thargs = NULL; ++ } ++out: ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * test if @dentry dir can be rename destination or not. ++ * success means, it is a logically empty dir. ++ */ ++static int may_rename_dstdir(struct dentry *dentry, struct au_nhash *whlist) ++{ ++ return au_test_empty(dentry, whlist); ++} ++ ++/* ++ * test if @dentry dir can be rename source or not. ++ * if it can, return 0 and @children is filled. ++ * success means, ++ * - it is a logically empty dir. ++ * - or, it exists on writable branch and has no children including whiteouts ++ * on the lower branch. ++ */ ++static int may_rename_srcdir(struct dentry *dentry, aufs_bindex_t btgt) ++{ ++ int err; ++ unsigned int rdhash; ++ aufs_bindex_t btop; ++ ++ btop = au_dbtop(dentry); ++ if (btop != btgt) { ++ struct au_nhash whlist; ++ ++ SiMustAnyLock(dentry->d_sb); ++ rdhash = au_sbi(dentry->d_sb)->si_rdhash; ++ if (!rdhash) ++ rdhash = au_rdhash_est(au_dir_size(/*file*/NULL, ++ dentry)); ++ err = au_nhash_alloc(&whlist, rdhash, GFP_NOFS); ++ if (unlikely(err)) ++ goto out; ++ err = au_test_empty(dentry, &whlist); ++ au_nhash_wh_free(&whlist); ++ goto out; ++ } ++ ++ if (btop == au_dbtaildir(dentry)) ++ return 0; /* success */ ++ ++ err = au_test_empty_lower(dentry); ++ ++out: ++ if (err == -ENOTEMPTY) { ++ AuWarn1("renaming dir who has child(ren) on multiple branches," ++ " is not supported\n"); ++ err = -EXDEV; ++ } ++ return err; ++} ++ ++/* side effect: sets whlist and h_dentry */ ++static int au_ren_may_dir(struct au_ren_args *a) ++{ ++ int err; ++ unsigned int rdhash; ++ struct dentry *d; ++ ++ d = a->dst_dentry; ++ SiMustAnyLock(d->d_sb); ++ ++ err = 0; ++ if (au_ftest_ren(a->flags, ISDIR) && a->dst_inode) { ++ rdhash = au_sbi(d->d_sb)->si_rdhash; ++ if (!rdhash) ++ rdhash = au_rdhash_est(au_dir_size(/*file*/NULL, d)); ++ err = au_nhash_alloc(&a->whlist, rdhash, GFP_NOFS); ++ if (unlikely(err)) ++ goto out; ++ ++ au_set_dbtop(d, a->dst_btop); ++ err = may_rename_dstdir(d, &a->whlist); ++ au_set_dbtop(d, a->btgt); ++ } ++ a->dst_h_dentry = au_h_dptr(d, au_dbtop(d)); ++ if (unlikely(err)) ++ goto out; ++ ++ d = a->src_dentry; ++ a->src_h_dentry = au_h_dptr(d, au_dbtop(d)); ++ if (au_ftest_ren(a->flags, ISDIR)) { ++ err = may_rename_srcdir(d, a->btgt); ++ if (unlikely(err)) { ++ au_nhash_wh_free(&a->whlist); ++ a->whlist.nh_num = 0; ++ } ++ } ++out: ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * simple tests for rename. ++ * following the checks in vfs, plus the parent-child relationship. ++ */ ++static int au_may_ren(struct au_ren_args *a) ++{ ++ int err, isdir; ++ struct inode *h_inode; ++ ++ if (a->src_btop == a->btgt) { ++ err = au_may_del(a->src_dentry, a->btgt, a->src_h_parent, ++ au_ftest_ren(a->flags, ISDIR)); ++ if (unlikely(err)) ++ goto out; ++ err = -EINVAL; ++ if (unlikely(a->src_h_dentry == a->h_trap)) ++ goto out; ++ } ++ ++ err = 0; ++ if (a->dst_btop != a->btgt) ++ goto out; ++ ++ err = -ENOTEMPTY; ++ if (unlikely(a->dst_h_dentry == a->h_trap)) ++ goto out; ++ ++ err = -EIO; ++ isdir = !!au_ftest_ren(a->flags, ISDIR); ++ if (d_really_is_negative(a->dst_dentry)) { ++ if (d_is_negative(a->dst_h_dentry)) ++ err = au_may_add(a->dst_dentry, a->btgt, ++ a->dst_h_parent, isdir); ++ } else { ++ if (unlikely(d_is_negative(a->dst_h_dentry))) ++ goto out; ++ h_inode = d_inode(a->dst_h_dentry); ++ if (h_inode->i_nlink) ++ err = au_may_del(a->dst_dentry, a->btgt, ++ a->dst_h_parent, isdir); ++ } ++ ++out: ++ if (unlikely(err == -ENOENT || err == -EEXIST)) ++ err = -EIO; ++ AuTraceErr(err); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * locking order ++ * (VFS) ++ * - src_dir and dir by lock_rename() ++ * - inode if exitsts ++ * (aufs) ++ * - lock all ++ * + src_dentry and dentry by aufs_read_and_write_lock2() which calls, ++ * + si_read_lock ++ * + di_write_lock2_child() ++ * + di_write_lock_child() ++ * + ii_write_lock_child() ++ * + di_write_lock_child2() ++ * + ii_write_lock_child2() ++ * + src_parent and parent ++ * + di_write_lock_parent() ++ * + ii_write_lock_parent() ++ * + di_write_lock_parent2() ++ * + ii_write_lock_parent2() ++ * + lower src_dir and dir by vfsub_lock_rename() ++ * + verify the every relationships between child and parent. if any ++ * of them failed, unlock all and return -EBUSY. ++ */ ++static void au_ren_unlock(struct au_ren_args *a) ++{ ++ vfsub_unlock_rename(a->src_h_parent, a->src_hdir, ++ a->dst_h_parent, a->dst_hdir); ++ if (au_ftest_ren(a->flags, MNT_WRITE)) ++ vfsub_mnt_drop_write(au_br_mnt(a->br)); ++} ++ ++static int au_ren_lock(struct au_ren_args *a) ++{ ++ int err; ++ unsigned int udba; ++ ++ err = 0; ++ a->src_h_parent = au_h_dptr(a->src_parent, a->btgt); ++ a->src_hdir = au_hi(a->src_dir, a->btgt); ++ a->dst_h_parent = au_h_dptr(a->dst_parent, a->btgt); ++ a->dst_hdir = au_hi(a->dst_dir, a->btgt); ++ ++ err = vfsub_mnt_want_write(au_br_mnt(a->br)); ++ if (unlikely(err)) ++ goto out; ++ au_fset_ren(a->flags, MNT_WRITE); ++ a->h_trap = vfsub_lock_rename(a->src_h_parent, a->src_hdir, ++ a->dst_h_parent, a->dst_hdir); ++ udba = au_opt_udba(a->src_dentry->d_sb); ++ if (unlikely(a->src_hdir->hi_inode != d_inode(a->src_h_parent) ++ || a->dst_hdir->hi_inode != d_inode(a->dst_h_parent))) ++ err = au_busy_or_stale(); ++ if (!err && au_dbtop(a->src_dentry) == a->btgt) ++ err = au_h_verify(a->src_h_dentry, udba, ++ d_inode(a->src_h_parent), a->src_h_parent, ++ a->br); ++ if (!err && au_dbtop(a->dst_dentry) == a->btgt) ++ err = au_h_verify(a->dst_h_dentry, udba, ++ d_inode(a->dst_h_parent), a->dst_h_parent, ++ a->br); ++ if (!err) ++ goto out; /* success */ ++ ++ err = au_busy_or_stale(); ++ au_ren_unlock(a); ++ ++out: ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static void au_ren_refresh_dir(struct au_ren_args *a) ++{ ++ struct inode *dir; ++ ++ dir = a->dst_dir; ++ dir->i_version++; ++ if (au_ftest_ren(a->flags, ISDIR)) { ++ /* is this updating defined in POSIX? */ ++ au_cpup_attr_timesizes(a->src_inode); ++ au_cpup_attr_nlink(dir, /*force*/1); ++ } ++ ++ au_dir_ts(dir, a->btgt); ++ ++ if (au_ftest_ren(a->flags, ISSAMEDIR)) ++ return; ++ ++ dir = a->src_dir; ++ dir->i_version++; ++ if (au_ftest_ren(a->flags, ISDIR)) ++ au_cpup_attr_nlink(dir, /*force*/1); ++ au_dir_ts(dir, a->btgt); ++} ++ ++static void au_ren_refresh(struct au_ren_args *a) ++{ ++ aufs_bindex_t bbot, bindex; ++ struct dentry *d, *h_d; ++ struct inode *i, *h_i; ++ struct super_block *sb; ++ ++ d = a->dst_dentry; ++ d_drop(d); ++ if (a->h_dst) ++ /* already dget-ed by au_ren_or_cpup() */ ++ au_set_h_dptr(d, a->btgt, a->h_dst); ++ ++ i = a->dst_inode; ++ if (i) { ++ if (!au_ftest_ren(a->flags, ISDIR)) ++ vfsub_drop_nlink(i); ++ else { ++ vfsub_dead_dir(i); ++ au_cpup_attr_timesizes(i); ++ } ++ au_update_dbrange(d, /*do_put_zero*/1); ++ } else { ++ bbot = a->btgt; ++ for (bindex = au_dbtop(d); bindex < bbot; bindex++) ++ au_set_h_dptr(d, bindex, NULL); ++ bbot = au_dbbot(d); ++ for (bindex = a->btgt + 1; bindex <= bbot; bindex++) ++ au_set_h_dptr(d, bindex, NULL); ++ au_update_dbrange(d, /*do_put_zero*/0); ++ } ++ ++ d = a->src_dentry; ++ au_set_dbwh(d, -1); ++ bbot = au_dbbot(d); ++ for (bindex = a->btgt + 1; bindex <= bbot; bindex++) { ++ h_d = au_h_dptr(d, bindex); ++ if (h_d) ++ au_set_h_dptr(d, bindex, NULL); ++ } ++ au_set_dbbot(d, a->btgt); ++ ++ sb = d->d_sb; ++ i = a->src_inode; ++ if (au_opt_test(au_mntflags(sb), PLINK) && au_plink_test(i)) ++ return; /* success */ ++ ++ bbot = au_ibbot(i); ++ for (bindex = a->btgt + 1; bindex <= bbot; bindex++) { ++ h_i = au_h_iptr(i, bindex); ++ if (h_i) { ++ au_xino_write(sb, bindex, h_i->i_ino, /*ino*/0); ++ /* ignore this error */ ++ au_set_h_iptr(i, bindex, NULL, 0); ++ } ++ } ++ au_set_ibbot(i, a->btgt); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* mainly for link(2) and rename(2) */ ++int au_wbr(struct dentry *dentry, aufs_bindex_t btgt) ++{ ++ aufs_bindex_t bdiropq, bwh; ++ struct dentry *parent; ++ struct au_branch *br; ++ ++ parent = dentry->d_parent; ++ IMustLock(d_inode(parent)); /* dir is locked */ ++ ++ bdiropq = au_dbdiropq(parent); ++ bwh = au_dbwh(dentry); ++ br = au_sbr(dentry->d_sb, btgt); ++ if (au_br_rdonly(br) ++ || (0 <= bdiropq && bdiropq < btgt) ++ || (0 <= bwh && bwh < btgt)) ++ btgt = -1; ++ ++ AuDbg("btgt %d\n", btgt); ++ return btgt; ++} ++ ++/* sets src_btop, dst_btop and btgt */ ++static int au_ren_wbr(struct au_ren_args *a) ++{ ++ int err; ++ struct au_wr_dir_args wr_dir_args = { ++ /* .force_btgt = -1, */ ++ .flags = AuWrDir_ADD_ENTRY ++ }; ++ ++ a->src_btop = au_dbtop(a->src_dentry); ++ a->dst_btop = au_dbtop(a->dst_dentry); ++ if (au_ftest_ren(a->flags, ISDIR)) ++ au_fset_wrdir(wr_dir_args.flags, ISDIR); ++ wr_dir_args.force_btgt = a->src_btop; ++ if (a->dst_inode && a->dst_btop < a->src_btop) ++ wr_dir_args.force_btgt = a->dst_btop; ++ wr_dir_args.force_btgt = au_wbr(a->dst_dentry, wr_dir_args.force_btgt); ++ err = au_wr_dir(a->dst_dentry, a->src_dentry, &wr_dir_args); ++ a->btgt = err; ++ ++ return err; ++} ++ ++static void au_ren_dt(struct au_ren_args *a) ++{ ++ a->h_path.dentry = a->src_h_parent; ++ au_dtime_store(a->src_dt + AuPARENT, a->src_parent, &a->h_path); ++ if (!au_ftest_ren(a->flags, ISSAMEDIR)) { ++ a->h_path.dentry = a->dst_h_parent; ++ au_dtime_store(a->dst_dt + AuPARENT, a->dst_parent, &a->h_path); ++ } ++ ++ au_fclr_ren(a->flags, DT_DSTDIR); ++ if (!au_ftest_ren(a->flags, ISDIR)) ++ return; ++ ++ a->h_path.dentry = a->src_h_dentry; ++ au_dtime_store(a->src_dt + AuCHILD, a->src_dentry, &a->h_path); ++ if (d_is_positive(a->dst_h_dentry)) { ++ au_fset_ren(a->flags, DT_DSTDIR); ++ a->h_path.dentry = a->dst_h_dentry; ++ au_dtime_store(a->dst_dt + AuCHILD, a->dst_dentry, &a->h_path); ++ } ++} ++ ++static void au_ren_rev_dt(int err, struct au_ren_args *a) ++{ ++ struct dentry *h_d; ++ struct inode *h_inode; ++ ++ au_dtime_revert(a->src_dt + AuPARENT); ++ if (!au_ftest_ren(a->flags, ISSAMEDIR)) ++ au_dtime_revert(a->dst_dt + AuPARENT); ++ ++ if (au_ftest_ren(a->flags, ISDIR) && err != -EIO) { ++ h_d = a->src_dt[AuCHILD].dt_h_path.dentry; ++ h_inode = d_inode(h_d); ++ inode_lock_nested(h_inode, AuLsc_I_CHILD); ++ au_dtime_revert(a->src_dt + AuCHILD); ++ inode_unlock(h_inode); ++ ++ if (au_ftest_ren(a->flags, DT_DSTDIR)) { ++ h_d = a->dst_dt[AuCHILD].dt_h_path.dentry; ++ h_inode = d_inode(h_d); ++ inode_lock_nested(h_inode, AuLsc_I_CHILD); ++ au_dtime_revert(a->dst_dt + AuCHILD); ++ inode_unlock(h_inode); ++ } ++ } ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++int aufs_rename(struct inode *_src_dir, struct dentry *_src_dentry, ++ struct inode *_dst_dir, struct dentry *_dst_dentry) ++{ ++ int err, flags; ++ /* reduce stack space */ ++ struct au_ren_args *a; ++ ++ AuDbg("%pd, %pd\n", _src_dentry, _dst_dentry); ++ IMustLock(_src_dir); ++ IMustLock(_dst_dir); ++ ++ err = -ENOMEM; ++ BUILD_BUG_ON(sizeof(*a) > PAGE_SIZE); ++ a = kzalloc(sizeof(*a), GFP_NOFS); ++ if (unlikely(!a)) ++ goto out; ++ ++ a->src_dir = _src_dir; ++ a->src_dentry = _src_dentry; ++ a->src_inode = NULL; ++ if (d_really_is_positive(a->src_dentry)) ++ a->src_inode = d_inode(a->src_dentry); ++ a->src_parent = a->src_dentry->d_parent; /* dir inode is locked */ ++ a->dst_dir = _dst_dir; ++ a->dst_dentry = _dst_dentry; ++ a->dst_inode = NULL; ++ if (d_really_is_positive(a->dst_dentry)) ++ a->dst_inode = d_inode(a->dst_dentry); ++ a->dst_parent = a->dst_dentry->d_parent; /* dir inode is locked */ ++ if (a->dst_inode) { ++ IMustLock(a->dst_inode); ++ au_igrab(a->dst_inode); ++ } ++ ++ err = -ENOTDIR; ++ flags = AuLock_FLUSH | AuLock_NOPLM | AuLock_GEN; ++ if (d_is_dir(a->src_dentry)) { ++ au_fset_ren(a->flags, ISDIR); ++ if (unlikely(d_really_is_positive(a->dst_dentry) ++ && !d_is_dir(a->dst_dentry))) ++ goto out_free; ++ flags |= AuLock_DIRS; ++ } ++ err = aufs_read_and_write_lock2(a->dst_dentry, a->src_dentry, flags); ++ if (unlikely(err)) ++ goto out_free; ++ ++ err = au_d_hashed_positive(a->src_dentry); ++ if (unlikely(err)) ++ goto out_unlock; ++ err = -ENOENT; ++ if (a->dst_inode) { ++ /* ++ * If it is a dir, VFS unhash dst_dentry before this ++ * function. It means we cannot rely upon d_unhashed(). ++ */ ++ if (unlikely(!a->dst_inode->i_nlink)) ++ goto out_unlock; ++ if (!S_ISDIR(a->dst_inode->i_mode)) { ++ err = au_d_hashed_positive(a->dst_dentry); ++ if (unlikely(err)) ++ goto out_unlock; ++ } else if (unlikely(IS_DEADDIR(a->dst_inode))) ++ goto out_unlock; ++ } else if (unlikely(d_unhashed(a->dst_dentry))) ++ goto out_unlock; ++ ++ /* ++ * is it possible? ++ * yes, it happened (in linux-3.3-rcN) but I don't know why. ++ * there may exist a problem somewhere else. ++ */ ++ err = -EINVAL; ++ if (unlikely(d_inode(a->dst_parent) == d_inode(a->src_dentry))) ++ goto out_unlock; ++ ++ au_fset_ren(a->flags, ISSAMEDIR); /* temporary */ ++ di_write_lock_parent(a->dst_parent); ++ ++ /* which branch we process */ ++ err = au_ren_wbr(a); ++ if (unlikely(err < 0)) ++ goto out_parent; ++ a->br = au_sbr(a->dst_dentry->d_sb, a->btgt); ++ a->h_path.mnt = au_br_mnt(a->br); ++ ++ /* are they available to be renamed */ ++ err = au_ren_may_dir(a); ++ if (unlikely(err)) ++ goto out_children; ++ ++ /* prepare the writable parent dir on the same branch */ ++ if (a->dst_btop == a->btgt) { ++ au_fset_ren(a->flags, WHDST); ++ } else { ++ err = au_cpup_dirs(a->dst_dentry, a->btgt); ++ if (unlikely(err)) ++ goto out_children; ++ } ++ ++ if (a->src_dir != a->dst_dir) { ++ /* ++ * this temporary unlock is safe, ++ * because both dir->i_mutex are locked. ++ */ ++ di_write_unlock(a->dst_parent); ++ di_write_lock_parent(a->src_parent); ++ err = au_wr_dir_need_wh(a->src_dentry, ++ au_ftest_ren(a->flags, ISDIR), ++ &a->btgt); ++ di_write_unlock(a->src_parent); ++ di_write_lock2_parent(a->src_parent, a->dst_parent, /*isdir*/1); ++ au_fclr_ren(a->flags, ISSAMEDIR); ++ } else ++ err = au_wr_dir_need_wh(a->src_dentry, ++ au_ftest_ren(a->flags, ISDIR), ++ &a->btgt); ++ if (unlikely(err < 0)) ++ goto out_children; ++ if (err) ++ au_fset_ren(a->flags, WHSRC); ++ ++ /* cpup src */ ++ if (a->src_btop != a->btgt) { ++ struct au_pin pin; ++ ++ err = au_pin(&pin, a->src_dentry, a->btgt, ++ au_opt_udba(a->src_dentry->d_sb), ++ AuPin_DI_LOCKED | AuPin_MNT_WRITE); ++ if (!err) { ++ struct au_cp_generic cpg = { ++ .dentry = a->src_dentry, ++ .bdst = a->btgt, ++ .bsrc = a->src_btop, ++ .len = -1, ++ .pin = &pin, ++ .flags = AuCpup_DTIME | AuCpup_HOPEN ++ }; ++ AuDebugOn(au_dbtop(a->src_dentry) != a->src_btop); ++ err = au_sio_cpup_simple(&cpg); ++ au_unpin(&pin); ++ } ++ if (unlikely(err)) ++ goto out_children; ++ a->src_btop = a->btgt; ++ a->src_h_dentry = au_h_dptr(a->src_dentry, a->btgt); ++ au_fset_ren(a->flags, WHSRC); ++ } ++ ++ /* lock them all */ ++ err = au_ren_lock(a); ++ if (unlikely(err)) ++ /* leave the copied-up one */ ++ goto out_children; ++ ++ if (!au_opt_test(au_mntflags(a->dst_dir->i_sb), UDBA_NONE)) ++ err = au_may_ren(a); ++ else if (unlikely(a->dst_dentry->d_name.len > AUFS_MAX_NAMELEN)) ++ err = -ENAMETOOLONG; ++ if (unlikely(err)) ++ goto out_hdir; ++ ++ /* store timestamps to be revertible */ ++ au_ren_dt(a); ++ ++ /* here we go */ ++ err = do_rename(a); ++ if (unlikely(err)) ++ goto out_dt; ++ ++ /* update dir attributes */ ++ au_ren_refresh_dir(a); ++ ++ /* dput/iput all lower dentries */ ++ au_ren_refresh(a); ++ ++ goto out_hdir; /* success */ ++ ++out_dt: ++ au_ren_rev_dt(err, a); ++out_hdir: ++ au_ren_unlock(a); ++out_children: ++ au_nhash_wh_free(&a->whlist); ++ if (err && a->dst_inode && a->dst_btop != a->btgt) { ++ AuDbg("btop %d, btgt %d\n", a->dst_btop, a->btgt); ++ au_set_h_dptr(a->dst_dentry, a->btgt, NULL); ++ au_set_dbtop(a->dst_dentry, a->dst_btop); ++ } ++out_parent: ++ if (!err) ++ d_move(a->src_dentry, a->dst_dentry); ++ else { ++ au_update_dbtop(a->dst_dentry); ++ if (!a->dst_inode) ++ d_drop(a->dst_dentry); ++ } ++ if (au_ftest_ren(a->flags, ISSAMEDIR)) ++ di_write_unlock(a->dst_parent); ++ else ++ di_write_unlock2(a->src_parent, a->dst_parent); ++out_unlock: ++ aufs_read_and_write_unlock2(a->dst_dentry, a->src_dentry); ++out_free: ++ iput(a->dst_inode); ++ if (a->thargs) ++ au_whtmp_rmdir_free(a->thargs); ++ kfree(a); ++out: ++ AuTraceErr(err); ++ return err; ++} +diff --git a/fs/aufs/iinfo.c b/fs/aufs/iinfo.c +new file mode 100644 +index 0000000..29023ce +--- /dev/null ++++ b/fs/aufs/iinfo.c +@@ -0,0 +1,280 @@ ++/* ++ * Copyright (C) 2005-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * inode private data ++ */ ++ ++#include "aufs.h" ++ ++struct inode *au_h_iptr(struct inode *inode, aufs_bindex_t bindex) ++{ ++ struct inode *h_inode; ++ struct au_hinode *hinode; ++ ++ IiMustAnyLock(inode); ++ ++ hinode = au_hinode(au_ii(inode), bindex); ++ h_inode = hinode->hi_inode; ++ AuDebugOn(h_inode && atomic_read(&h_inode->i_count) <= 0); ++ return h_inode; ++} ++ ++/* todo: hard/soft set? */ ++void au_hiput(struct au_hinode *hinode) ++{ ++ au_hn_free(hinode); ++ dput(hinode->hi_whdentry); ++ iput(hinode->hi_inode); ++} ++ ++unsigned int au_hi_flags(struct inode *inode, int isdir) ++{ ++ unsigned int flags; ++ const unsigned int mnt_flags = au_mntflags(inode->i_sb); ++ ++ flags = 0; ++ if (au_opt_test(mnt_flags, XINO)) ++ au_fset_hi(flags, XINO); ++ if (isdir && au_opt_test(mnt_flags, UDBA_HNOTIFY)) ++ au_fset_hi(flags, HNOTIFY); ++ return flags; ++} ++ ++void au_set_h_iptr(struct inode *inode, aufs_bindex_t bindex, ++ struct inode *h_inode, unsigned int flags) ++{ ++ struct au_hinode *hinode; ++ struct inode *hi; ++ struct au_iinfo *iinfo = au_ii(inode); ++ ++ IiMustWriteLock(inode); ++ ++ hinode = au_hinode(iinfo, bindex); ++ hi = hinode->hi_inode; ++ AuDebugOn(h_inode && atomic_read(&h_inode->i_count) <= 0); ++ ++ if (hi) ++ au_hiput(hinode); ++ hinode->hi_inode = h_inode; ++ if (h_inode) { ++ int err; ++ struct super_block *sb = inode->i_sb; ++ struct au_branch *br; ++ ++ AuDebugOn(inode->i_mode ++ && (h_inode->i_mode & S_IFMT) ++ != (inode->i_mode & S_IFMT)); ++ if (bindex == iinfo->ii_btop) ++ au_cpup_igen(inode, h_inode); ++ br = au_sbr(sb, bindex); ++ hinode->hi_id = br->br_id; ++ if (au_ftest_hi(flags, XINO)) { ++ err = au_xino_write(sb, bindex, h_inode->i_ino, ++ inode->i_ino); ++ if (unlikely(err)) ++ AuIOErr1("failed au_xino_write() %d\n", err); ++ } ++ ++ if (au_ftest_hi(flags, HNOTIFY) ++ && au_br_hnotifyable(br->br_perm)) { ++ err = au_hn_alloc(hinode, inode); ++ if (unlikely(err)) ++ AuIOErr1("au_hn_alloc() %d\n", err); ++ } ++ } ++} ++ ++void au_set_hi_wh(struct inode *inode, aufs_bindex_t bindex, ++ struct dentry *h_wh) ++{ ++ struct au_hinode *hinode; ++ ++ IiMustWriteLock(inode); ++ ++ hinode = au_hinode(au_ii(inode), bindex); ++ AuDebugOn(hinode->hi_whdentry); ++ hinode->hi_whdentry = h_wh; ++} ++ ++void au_update_iigen(struct inode *inode, int half) ++{ ++ struct au_iinfo *iinfo; ++ struct au_iigen *iigen; ++ unsigned int sigen; ++ ++ sigen = au_sigen(inode->i_sb); ++ iinfo = au_ii(inode); ++ iigen = &iinfo->ii_generation; ++ spin_lock(&iigen->ig_spin); ++ iigen->ig_generation = sigen; ++ if (half) ++ au_ig_fset(iigen->ig_flags, HALF_REFRESHED); ++ else ++ au_ig_fclr(iigen->ig_flags, HALF_REFRESHED); ++ spin_unlock(&iigen->ig_spin); ++} ++ ++/* it may be called at remount time, too */ ++void au_update_ibrange(struct inode *inode, int do_put_zero) ++{ ++ struct au_iinfo *iinfo; ++ aufs_bindex_t bindex, bbot; ++ ++ AuDebugOn(is_bad_inode(inode)); ++ IiMustWriteLock(inode); ++ ++ iinfo = au_ii(inode); ++ if (do_put_zero && iinfo->ii_btop >= 0) { ++ for (bindex = iinfo->ii_btop; bindex <= iinfo->ii_bbot; ++ bindex++) { ++ struct inode *h_i; ++ ++ h_i = au_hinode(iinfo, bindex)->hi_inode; ++ if (h_i ++ && !h_i->i_nlink ++ && !(h_i->i_state & I_LINKABLE)) ++ au_set_h_iptr(inode, bindex, NULL, 0); ++ } ++ } ++ ++ iinfo->ii_btop = -1; ++ iinfo->ii_bbot = -1; ++ bbot = au_sbbot(inode->i_sb); ++ for (bindex = 0; bindex <= bbot; bindex++) ++ if (au_hinode(iinfo, bindex)->hi_inode) { ++ iinfo->ii_btop = bindex; ++ break; ++ } ++ if (iinfo->ii_btop >= 0) ++ for (bindex = bbot; bindex >= iinfo->ii_btop; bindex--) ++ if (au_hinode(iinfo, bindex)->hi_inode) { ++ iinfo->ii_bbot = bindex; ++ break; ++ } ++ AuDebugOn(iinfo->ii_btop > iinfo->ii_bbot); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++void au_icntnr_init_once(void *_c) ++{ ++ struct au_icntnr *c = _c; ++ struct au_iinfo *iinfo = &c->iinfo; ++ ++ spin_lock_init(&iinfo->ii_generation.ig_spin); ++ au_rw_init(&iinfo->ii_rwsem); ++ inode_init_once(&c->vfs_inode); ++} ++ ++void au_hinode_init(struct au_hinode *hinode) ++{ ++ hinode->hi_inode = NULL; ++ hinode->hi_id = -1; ++ au_hn_init(hinode); ++ hinode->hi_whdentry = NULL; ++} ++ ++int au_iinfo_init(struct inode *inode) ++{ ++ struct au_iinfo *iinfo; ++ struct super_block *sb; ++ int nbr, i; ++ ++ sb = inode->i_sb; ++ iinfo = &(container_of(inode, struct au_icntnr, vfs_inode)->iinfo); ++ nbr = au_sbbot(sb) + 1; ++ if (unlikely(nbr <= 0)) ++ nbr = 1; ++ iinfo->ii_hinode = kmalloc_array(nbr, sizeof(*iinfo->ii_hinode), ++ GFP_NOFS); ++ if (iinfo->ii_hinode) { ++ au_ninodes_inc(sb); ++ for (i = 0; i < nbr; i++) ++ au_hinode_init(iinfo->ii_hinode + i); ++ ++ iinfo->ii_generation.ig_generation = au_sigen(sb); ++ iinfo->ii_btop = -1; ++ iinfo->ii_bbot = -1; ++ iinfo->ii_vdir = NULL; ++ return 0; ++ } ++ return -ENOMEM; ++} ++ ++int au_hinode_realloc(struct au_iinfo *iinfo, int nbr) ++{ ++ int err, i; ++ struct au_hinode *hip; ++ ++ AuRwMustWriteLock(&iinfo->ii_rwsem); ++ ++ err = -ENOMEM; ++ hip = krealloc(iinfo->ii_hinode, sizeof(*hip) * nbr, GFP_NOFS); ++ if (hip) { ++ for (i = iinfo->ii_bbot + 1; i < nbr; i++) ++ au_hinode_init(hip + i); ++ iinfo->ii_hinode = hip; ++ err = 0; ++ } ++ ++ return err; ++} ++ ++void au_iinfo_fin(struct inode *inode) ++{ ++ struct au_iinfo *iinfo; ++ struct au_hinode *hi; ++ struct super_block *sb; ++ aufs_bindex_t bindex, bbot; ++ const unsigned char unlinked = !inode->i_nlink; ++ ++ AuDebugOn(is_bad_inode(inode)); ++ ++ sb = inode->i_sb; ++ au_ninodes_dec(sb); ++ if (si_pid_test(sb)) ++ au_xino_delete_inode(inode, unlinked); ++ else { ++ /* ++ * it is safe to hide the dependency between sbinfo and ++ * sb->s_umount. ++ */ ++ lockdep_off(); ++ si_noflush_read_lock(sb); ++ au_xino_delete_inode(inode, unlinked); ++ si_read_unlock(sb); ++ lockdep_on(); ++ } ++ ++ iinfo = au_ii(inode); ++ if (iinfo->ii_vdir) ++ au_vdir_free(iinfo->ii_vdir); ++ ++ bindex = iinfo->ii_btop; ++ if (bindex >= 0) { ++ hi = au_hinode(iinfo, bindex); ++ bbot = iinfo->ii_bbot; ++ while (bindex++ <= bbot) { ++ if (hi->hi_inode) ++ au_hiput(hi); ++ hi++; ++ } ++ } ++ kfree(iinfo->ii_hinode); ++ AuRwDestroy(&iinfo->ii_rwsem); ++} +diff --git a/fs/aufs/inode.c b/fs/aufs/inode.c +new file mode 100644 +index 0000000..1794574 +--- /dev/null ++++ b/fs/aufs/inode.c +@@ -0,0 +1,517 @@ ++/* ++ * Copyright (C) 2005-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * inode functions ++ */ ++ ++#include "aufs.h" ++ ++struct inode *au_igrab(struct inode *inode) ++{ ++ if (inode) { ++ AuDebugOn(!atomic_read(&inode->i_count)); ++ ihold(inode); ++ } ++ return inode; ++} ++ ++static void au_refresh_hinode_attr(struct inode *inode, int do_version) ++{ ++ au_cpup_attr_all(inode, /*force*/0); ++ au_update_iigen(inode, /*half*/1); ++ if (do_version) ++ inode->i_version++; ++} ++ ++static int au_ii_refresh(struct inode *inode, int *update) ++{ ++ int err, e; ++ umode_t type; ++ aufs_bindex_t bindex, new_bindex; ++ struct super_block *sb; ++ struct au_iinfo *iinfo; ++ struct au_hinode *p, *q, tmp; ++ ++ AuDebugOn(is_bad_inode(inode)); ++ IiMustWriteLock(inode); ++ ++ *update = 0; ++ sb = inode->i_sb; ++ type = inode->i_mode & S_IFMT; ++ iinfo = au_ii(inode); ++ err = au_hinode_realloc(iinfo, au_sbbot(sb) + 1); ++ if (unlikely(err)) ++ goto out; ++ ++ AuDebugOn(iinfo->ii_btop < 0); ++ p = au_hinode(iinfo, iinfo->ii_btop); ++ for (bindex = iinfo->ii_btop; bindex <= iinfo->ii_bbot; ++ bindex++, p++) { ++ if (!p->hi_inode) ++ continue; ++ ++ AuDebugOn(type != (p->hi_inode->i_mode & S_IFMT)); ++ new_bindex = au_br_index(sb, p->hi_id); ++ if (new_bindex == bindex) ++ continue; ++ ++ if (new_bindex < 0) { ++ *update = 1; ++ au_hiput(p); ++ p->hi_inode = NULL; ++ continue; ++ } ++ ++ if (new_bindex < iinfo->ii_btop) ++ iinfo->ii_btop = new_bindex; ++ if (iinfo->ii_bbot < new_bindex) ++ iinfo->ii_bbot = new_bindex; ++ /* swap two lower inode, and loop again */ ++ q = au_hinode(iinfo, new_bindex); ++ tmp = *q; ++ *q = *p; ++ *p = tmp; ++ if (tmp.hi_inode) { ++ bindex--; ++ p--; ++ } ++ } ++ au_update_ibrange(inode, /*do_put_zero*/0); ++ e = au_dy_irefresh(inode); ++ if (unlikely(e && !err)) ++ err = e; ++ ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++void au_refresh_iop(struct inode *inode, int force_getattr) ++{ ++ int type; ++ struct au_sbinfo *sbi = au_sbi(inode->i_sb); ++ const struct inode_operations *iop ++ = force_getattr ? aufs_iop : sbi->si_iop_array; ++ ++ if (inode->i_op == iop) ++ return; ++ ++ switch (inode->i_mode & S_IFMT) { ++ case S_IFDIR: ++ type = AuIop_DIR; ++ break; ++ case S_IFLNK: ++ type = AuIop_SYMLINK; ++ break; ++ default: ++ type = AuIop_OTHER; ++ break; ++ } ++ ++ inode->i_op = iop + type; ++ /* unnecessary smp_wmb() */ ++} ++ ++int au_refresh_hinode_self(struct inode *inode) ++{ ++ int err, update; ++ ++ err = au_ii_refresh(inode, &update); ++ if (!err) ++ au_refresh_hinode_attr(inode, update && S_ISDIR(inode->i_mode)); ++ ++ AuTraceErr(err); ++ return err; ++} ++ ++int au_refresh_hinode(struct inode *inode, struct dentry *dentry) ++{ ++ int err, e, update; ++ unsigned int flags; ++ umode_t mode; ++ aufs_bindex_t bindex, bbot; ++ unsigned char isdir; ++ struct au_hinode *p; ++ struct au_iinfo *iinfo; ++ ++ err = au_ii_refresh(inode, &update); ++ if (unlikely(err)) ++ goto out; ++ ++ update = 0; ++ iinfo = au_ii(inode); ++ p = au_hinode(iinfo, iinfo->ii_btop); ++ mode = (inode->i_mode & S_IFMT); ++ isdir = S_ISDIR(mode); ++ flags = au_hi_flags(inode, isdir); ++ bbot = au_dbbot(dentry); ++ for (bindex = au_dbtop(dentry); bindex <= bbot; bindex++) { ++ struct inode *h_i, *h_inode; ++ struct dentry *h_d; ++ ++ h_d = au_h_dptr(dentry, bindex); ++ if (!h_d || d_is_negative(h_d)) ++ continue; ++ ++ h_inode = d_inode(h_d); ++ AuDebugOn(mode != (h_inode->i_mode & S_IFMT)); ++ if (iinfo->ii_btop <= bindex && bindex <= iinfo->ii_bbot) { ++ h_i = au_h_iptr(inode, bindex); ++ if (h_i) { ++ if (h_i == h_inode) ++ continue; ++ err = -EIO; ++ break; ++ } ++ } ++ if (bindex < iinfo->ii_btop) ++ iinfo->ii_btop = bindex; ++ if (iinfo->ii_bbot < bindex) ++ iinfo->ii_bbot = bindex; ++ au_set_h_iptr(inode, bindex, au_igrab(h_inode), flags); ++ update = 1; ++ } ++ au_update_ibrange(inode, /*do_put_zero*/0); ++ e = au_dy_irefresh(inode); ++ if (unlikely(e && !err)) ++ err = e; ++ if (!err) ++ au_refresh_hinode_attr(inode, update && isdir); ++ ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++static int set_inode(struct inode *inode, struct dentry *dentry) ++{ ++ int err; ++ unsigned int flags; ++ umode_t mode; ++ aufs_bindex_t bindex, btop, btail; ++ unsigned char isdir; ++ struct dentry *h_dentry; ++ struct inode *h_inode; ++ struct au_iinfo *iinfo; ++ struct inode_operations *iop; ++ ++ IiMustWriteLock(inode); ++ ++ err = 0; ++ isdir = 0; ++ iop = au_sbi(inode->i_sb)->si_iop_array; ++ btop = au_dbtop(dentry); ++ h_dentry = au_h_dptr(dentry, btop); ++ h_inode = d_inode(h_dentry); ++ mode = h_inode->i_mode; ++ switch (mode & S_IFMT) { ++ case S_IFREG: ++ btail = au_dbtail(dentry); ++ inode->i_op = iop + AuIop_OTHER; ++ inode->i_fop = &aufs_file_fop; ++ err = au_dy_iaop(inode, btop, h_inode); ++ if (unlikely(err)) ++ goto out; ++ break; ++ case S_IFDIR: ++ isdir = 1; ++ btail = au_dbtaildir(dentry); ++ inode->i_op = iop + AuIop_DIR; ++ inode->i_fop = &aufs_dir_fop; ++ break; ++ case S_IFLNK: ++ btail = au_dbtail(dentry); ++ inode->i_op = iop + AuIop_SYMLINK; ++ break; ++ case S_IFBLK: ++ case S_IFCHR: ++ case S_IFIFO: ++ case S_IFSOCK: ++ btail = au_dbtail(dentry); ++ inode->i_op = iop + AuIop_OTHER; ++ init_special_inode(inode, mode, h_inode->i_rdev); ++ break; ++ default: ++ AuIOErr("Unknown file type 0%o\n", mode); ++ err = -EIO; ++ goto out; ++ } ++ ++ /* do not set hnotify for whiteouted dirs (SHWH mode) */ ++ flags = au_hi_flags(inode, isdir); ++ if (au_opt_test(au_mntflags(dentry->d_sb), SHWH) ++ && au_ftest_hi(flags, HNOTIFY) ++ && dentry->d_name.len > AUFS_WH_PFX_LEN ++ && !memcmp(dentry->d_name.name, AUFS_WH_PFX, AUFS_WH_PFX_LEN)) ++ au_fclr_hi(flags, HNOTIFY); ++ iinfo = au_ii(inode); ++ iinfo->ii_btop = btop; ++ iinfo->ii_bbot = btail; ++ for (bindex = btop; bindex <= btail; bindex++) { ++ h_dentry = au_h_dptr(dentry, bindex); ++ if (h_dentry) ++ au_set_h_iptr(inode, bindex, ++ au_igrab(d_inode(h_dentry)), flags); ++ } ++ au_cpup_attr_all(inode, /*force*/1); ++ /* ++ * to force calling aufs_get_acl() every time, ++ * do not call cache_no_acl() for aufs inode. ++ */ ++ ++out: ++ return err; ++} ++ ++/* ++ * successful returns with iinfo write_locked ++ * minus: errno ++ * zero: success, matched ++ * plus: no error, but unmatched ++ */ ++static int reval_inode(struct inode *inode, struct dentry *dentry) ++{ ++ int err; ++ unsigned int gen, igflags; ++ aufs_bindex_t bindex, bbot; ++ struct inode *h_inode, *h_dinode; ++ struct dentry *h_dentry; ++ ++ /* ++ * before this function, if aufs got any iinfo lock, it must be only ++ * one, the parent dir. ++ * it can happen by UDBA and the obsoleted inode number. ++ */ ++ err = -EIO; ++ if (unlikely(inode->i_ino == parent_ino(dentry))) ++ goto out; ++ ++ err = 1; ++ ii_write_lock_new_child(inode); ++ h_dentry = au_h_dptr(dentry, au_dbtop(dentry)); ++ h_dinode = d_inode(h_dentry); ++ bbot = au_ibbot(inode); ++ for (bindex = au_ibtop(inode); bindex <= bbot; bindex++) { ++ h_inode = au_h_iptr(inode, bindex); ++ if (!h_inode || h_inode != h_dinode) ++ continue; ++ ++ err = 0; ++ gen = au_iigen(inode, &igflags); ++ if (gen == au_digen(dentry) ++ && !au_ig_ftest(igflags, HALF_REFRESHED)) ++ break; ++ ++ /* fully refresh inode using dentry */ ++ err = au_refresh_hinode(inode, dentry); ++ if (!err) ++ au_update_iigen(inode, /*half*/0); ++ break; ++ } ++ ++ if (unlikely(err)) ++ ii_write_unlock(inode); ++out: ++ return err; ++} ++ ++int au_ino(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino, ++ unsigned int d_type, ino_t *ino) ++{ ++ int err; ++ struct mutex *mtx; ++ ++ /* prevent hardlinked inode number from race condition */ ++ mtx = NULL; ++ if (d_type != DT_DIR) { ++ mtx = &au_sbr(sb, bindex)->br_xino.xi_nondir_mtx; ++ mutex_lock(mtx); ++ } ++ err = au_xino_read(sb, bindex, h_ino, ino); ++ if (unlikely(err)) ++ goto out; ++ ++ if (!*ino) { ++ err = -EIO; ++ *ino = au_xino_new_ino(sb); ++ if (unlikely(!*ino)) ++ goto out; ++ err = au_xino_write(sb, bindex, h_ino, *ino); ++ if (unlikely(err)) ++ goto out; ++ } ++ ++out: ++ if (mtx) ++ mutex_unlock(mtx); ++ return err; ++} ++ ++/* successful returns with iinfo write_locked */ ++/* todo: return with unlocked? */ ++struct inode *au_new_inode(struct dentry *dentry, int must_new) ++{ ++ struct inode *inode, *h_inode; ++ struct dentry *h_dentry; ++ struct super_block *sb; ++ struct mutex *mtx; ++ ino_t h_ino, ino; ++ int err; ++ aufs_bindex_t btop; ++ ++ sb = dentry->d_sb; ++ btop = au_dbtop(dentry); ++ h_dentry = au_h_dptr(dentry, btop); ++ h_inode = d_inode(h_dentry); ++ h_ino = h_inode->i_ino; ++ ++ /* ++ * stop 'race'-ing between hardlinks under different ++ * parents. ++ */ ++ mtx = NULL; ++ if (!d_is_dir(h_dentry)) ++ mtx = &au_sbr(sb, btop)->br_xino.xi_nondir_mtx; ++ ++new_ino: ++ if (mtx) ++ mutex_lock(mtx); ++ err = au_xino_read(sb, btop, h_ino, &ino); ++ inode = ERR_PTR(err); ++ if (unlikely(err)) ++ goto out; ++ ++ if (!ino) { ++ ino = au_xino_new_ino(sb); ++ if (unlikely(!ino)) { ++ inode = ERR_PTR(-EIO); ++ goto out; ++ } ++ } ++ ++ AuDbg("i%lu\n", (unsigned long)ino); ++ inode = au_iget_locked(sb, ino); ++ err = PTR_ERR(inode); ++ if (IS_ERR(inode)) ++ goto out; ++ ++ AuDbg("%lx, new %d\n", inode->i_state, !!(inode->i_state & I_NEW)); ++ if (inode->i_state & I_NEW) { ++ ii_write_lock_new_child(inode); ++ err = set_inode(inode, dentry); ++ if (!err) { ++ unlock_new_inode(inode); ++ goto out; /* success */ ++ } ++ ++ /* ++ * iget_failed() calls iput(), but we need to call ++ * ii_write_unlock() after iget_failed(). so dirty hack for ++ * i_count. ++ */ ++ atomic_inc(&inode->i_count); ++ iget_failed(inode); ++ ii_write_unlock(inode); ++ au_xino_write(sb, btop, h_ino, /*ino*/0); ++ /* ignore this error */ ++ goto out_iput; ++ } else if (!must_new && !IS_DEADDIR(inode) && inode->i_nlink) { ++ /* ++ * horrible race condition between lookup, readdir and copyup ++ * (or something). ++ */ ++ if (mtx) ++ mutex_unlock(mtx); ++ err = reval_inode(inode, dentry); ++ if (unlikely(err < 0)) { ++ mtx = NULL; ++ goto out_iput; ++ } ++ ++ if (!err) { ++ mtx = NULL; ++ goto out; /* success */ ++ } else if (mtx) ++ mutex_lock(mtx); ++ } ++ ++ if (unlikely(au_test_fs_unique_ino(h_inode))) ++ AuWarn1("Warning: Un-notified UDBA or repeatedly renamed dir," ++ " b%d, %s, %pd, hi%lu, i%lu.\n", ++ btop, au_sbtype(h_dentry->d_sb), dentry, ++ (unsigned long)h_ino, (unsigned long)ino); ++ ino = 0; ++ err = au_xino_write(sb, btop, h_ino, /*ino*/0); ++ if (!err) { ++ iput(inode); ++ if (mtx) ++ mutex_unlock(mtx); ++ goto new_ino; ++ } ++ ++out_iput: ++ iput(inode); ++ inode = ERR_PTR(err); ++out: ++ if (mtx) ++ mutex_unlock(mtx); ++ return inode; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++int au_test_ro(struct super_block *sb, aufs_bindex_t bindex, ++ struct inode *inode) ++{ ++ int err; ++ struct inode *hi; ++ ++ err = au_br_rdonly(au_sbr(sb, bindex)); ++ ++ /* pseudo-link after flushed may happen out of bounds */ ++ if (!err ++ && inode ++ && au_ibtop(inode) <= bindex ++ && bindex <= au_ibbot(inode)) { ++ /* ++ * permission check is unnecessary since vfsub routine ++ * will be called later ++ */ ++ hi = au_h_iptr(inode, bindex); ++ if (hi) ++ err = IS_IMMUTABLE(hi) ? -EROFS : 0; ++ } ++ ++ return err; ++} ++ ++int au_test_h_perm(struct inode *h_inode, int mask) ++{ ++ if (uid_eq(current_fsuid(), GLOBAL_ROOT_UID)) ++ return 0; ++ return inode_permission(h_inode, mask); ++} ++ ++int au_test_h_perm_sio(struct inode *h_inode, int mask) ++{ ++ if (au_test_nfs(h_inode->i_sb) ++ && (mask & MAY_WRITE) ++ && S_ISDIR(h_inode->i_mode)) ++ mask |= MAY_READ; /* force permission check */ ++ return au_test_h_perm(h_inode, mask); ++} +diff --git a/fs/aufs/inode.h b/fs/aufs/inode.h +new file mode 100644 +index 0000000..38db636 +--- /dev/null ++++ b/fs/aufs/inode.h +@@ -0,0 +1,689 @@ ++/* ++ * Copyright (C) 2005-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * inode operations ++ */ ++ ++#ifndef __AUFS_INODE_H__ ++#define __AUFS_INODE_H__ ++ ++#ifdef __KERNEL__ ++ ++#include ++#include "rwsem.h" ++ ++struct vfsmount; ++ ++struct au_hnotify { ++#ifdef CONFIG_AUFS_HNOTIFY ++#ifdef CONFIG_AUFS_HFSNOTIFY ++ /* never use fsnotify_add_vfsmount_mark() */ ++ struct fsnotify_mark hn_mark; ++#endif ++ struct inode *hn_aufs_inode; /* no get/put */ ++#endif ++} ____cacheline_aligned_in_smp; ++ ++struct au_hinode { ++ struct inode *hi_inode; ++ aufs_bindex_t hi_id; ++#ifdef CONFIG_AUFS_HNOTIFY ++ struct au_hnotify *hi_notify; ++#endif ++ ++ /* reference to the copied-up whiteout with get/put */ ++ struct dentry *hi_whdentry; ++}; ++ ++/* ig_flags */ ++#define AuIG_HALF_REFRESHED 1 ++#define au_ig_ftest(flags, name) ((flags) & AuIG_##name) ++#define au_ig_fset(flags, name) \ ++ do { (flags) |= AuIG_##name; } while (0) ++#define au_ig_fclr(flags, name) \ ++ do { (flags) &= ~AuIG_##name; } while (0) ++ ++struct au_iigen { ++ spinlock_t ig_spin; ++ __u32 ig_generation, ig_flags; ++}; ++ ++struct au_vdir; ++struct au_iinfo { ++ struct au_iigen ii_generation; ++ struct super_block *ii_hsb1; /* no get/put */ ++ ++ struct au_rwsem ii_rwsem; ++ aufs_bindex_t ii_btop, ii_bbot; ++ __u32 ii_higen; ++ struct au_hinode *ii_hinode; ++ struct au_vdir *ii_vdir; ++}; ++ ++struct au_icntnr { ++ struct au_iinfo iinfo; ++ struct inode vfs_inode; ++ struct hlist_node plink; ++} ____cacheline_aligned_in_smp; ++ ++/* au_pin flags */ ++#define AuPin_DI_LOCKED 1 ++#define AuPin_MNT_WRITE (1 << 1) ++#define au_ftest_pin(flags, name) ((flags) & AuPin_##name) ++#define au_fset_pin(flags, name) \ ++ do { (flags) |= AuPin_##name; } while (0) ++#define au_fclr_pin(flags, name) \ ++ do { (flags) &= ~AuPin_##name; } while (0) ++ ++struct au_pin { ++ /* input */ ++ struct dentry *dentry; ++ unsigned int udba; ++ unsigned char lsc_di, lsc_hi, flags; ++ aufs_bindex_t bindex; ++ ++ /* output */ ++ struct dentry *parent; ++ struct au_hinode *hdir; ++ struct vfsmount *h_mnt; ++ ++ /* temporary unlock/relock for copyup */ ++ struct dentry *h_dentry, *h_parent; ++ struct au_branch *br; ++ struct task_struct *task; ++}; ++ ++void au_pin_hdir_unlock(struct au_pin *p); ++int au_pin_hdir_lock(struct au_pin *p); ++int au_pin_hdir_relock(struct au_pin *p); ++void au_pin_hdir_set_owner(struct au_pin *p, struct task_struct *task); ++void au_pin_hdir_acquire_nest(struct au_pin *p); ++void au_pin_hdir_release(struct au_pin *p); ++ ++/* ---------------------------------------------------------------------- */ ++ ++static inline struct au_iinfo *au_ii(struct inode *inode) ++{ ++ BUG_ON(is_bad_inode(inode)); ++ return &(container_of(inode, struct au_icntnr, vfs_inode)->iinfo); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* inode.c */ ++struct inode *au_igrab(struct inode *inode); ++void au_refresh_iop(struct inode *inode, int force_getattr); ++int au_refresh_hinode_self(struct inode *inode); ++int au_refresh_hinode(struct inode *inode, struct dentry *dentry); ++int au_ino(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino, ++ unsigned int d_type, ino_t *ino); ++struct inode *au_new_inode(struct dentry *dentry, int must_new); ++int au_test_ro(struct super_block *sb, aufs_bindex_t bindex, ++ struct inode *inode); ++int au_test_h_perm(struct inode *h_inode, int mask); ++int au_test_h_perm_sio(struct inode *h_inode, int mask); ++ ++static inline int au_wh_ino(struct super_block *sb, aufs_bindex_t bindex, ++ ino_t h_ino, unsigned int d_type, ino_t *ino) ++{ ++#ifdef CONFIG_AUFS_SHWH ++ return au_ino(sb, bindex, h_ino, d_type, ino); ++#else ++ return 0; ++#endif ++} ++ ++/* i_op.c */ ++enum { ++ AuIop_SYMLINK, ++ AuIop_DIR, ++ AuIop_OTHER, ++ AuIop_Last ++}; ++extern struct inode_operations aufs_iop[AuIop_Last], ++ aufs_iop_nogetattr[AuIop_Last]; ++ ++/* au_wr_dir flags */ ++#define AuWrDir_ADD_ENTRY 1 ++#define AuWrDir_ISDIR (1 << 1) ++#define AuWrDir_TMPFILE (1 << 2) ++#define au_ftest_wrdir(flags, name) ((flags) & AuWrDir_##name) ++#define au_fset_wrdir(flags, name) \ ++ do { (flags) |= AuWrDir_##name; } while (0) ++#define au_fclr_wrdir(flags, name) \ ++ do { (flags) &= ~AuWrDir_##name; } while (0) ++ ++struct au_wr_dir_args { ++ aufs_bindex_t force_btgt; ++ unsigned char flags; ++}; ++int au_wr_dir(struct dentry *dentry, struct dentry *src_dentry, ++ struct au_wr_dir_args *args); ++ ++struct dentry *au_pinned_h_parent(struct au_pin *pin); ++void au_pin_init(struct au_pin *pin, struct dentry *dentry, ++ aufs_bindex_t bindex, int lsc_di, int lsc_hi, ++ unsigned int udba, unsigned char flags); ++int au_pin(struct au_pin *pin, struct dentry *dentry, aufs_bindex_t bindex, ++ unsigned int udba, unsigned char flags) __must_check; ++int au_do_pin(struct au_pin *pin) __must_check; ++void au_unpin(struct au_pin *pin); ++int au_reval_for_attr(struct dentry *dentry, unsigned int sigen); ++ ++#define AuIcpup_DID_CPUP 1 ++#define au_ftest_icpup(flags, name) ((flags) & AuIcpup_##name) ++#define au_fset_icpup(flags, name) \ ++ do { (flags) |= AuIcpup_##name; } while (0) ++#define au_fclr_icpup(flags, name) \ ++ do { (flags) &= ~AuIcpup_##name; } while (0) ++ ++struct au_icpup_args { ++ unsigned char flags; ++ unsigned char pin_flags; ++ aufs_bindex_t btgt; ++ unsigned int udba; ++ struct au_pin pin; ++ struct path h_path; ++ struct inode *h_inode; ++}; ++ ++int au_pin_and_icpup(struct dentry *dentry, struct iattr *ia, ++ struct au_icpup_args *a); ++ ++int au_h_path_getattr(struct dentry *dentry, int force, struct path *h_path); ++ ++/* i_op_add.c */ ++int au_may_add(struct dentry *dentry, aufs_bindex_t bindex, ++ struct dentry *h_parent, int isdir); ++int aufs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, ++ dev_t dev); ++int aufs_symlink(struct inode *dir, struct dentry *dentry, const char *symname); ++int aufs_create(struct inode *dir, struct dentry *dentry, umode_t mode, ++ bool want_excl); ++struct vfsub_aopen_args; ++int au_aopen_or_create(struct inode *dir, struct dentry *dentry, ++ struct vfsub_aopen_args *args); ++int aufs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode); ++int aufs_link(struct dentry *src_dentry, struct inode *dir, ++ struct dentry *dentry); ++int aufs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); ++ ++/* i_op_del.c */ ++int au_wr_dir_need_wh(struct dentry *dentry, int isdir, aufs_bindex_t *bcpup); ++int au_may_del(struct dentry *dentry, aufs_bindex_t bindex, ++ struct dentry *h_parent, int isdir); ++int aufs_unlink(struct inode *dir, struct dentry *dentry); ++int aufs_rmdir(struct inode *dir, struct dentry *dentry); ++ ++/* i_op_ren.c */ ++int au_wbr(struct dentry *dentry, aufs_bindex_t btgt); ++int aufs_rename(struct inode *src_dir, struct dentry *src_dentry, ++ struct inode *dir, struct dentry *dentry); ++ ++/* iinfo.c */ ++struct inode *au_h_iptr(struct inode *inode, aufs_bindex_t bindex); ++void au_hiput(struct au_hinode *hinode); ++void au_set_hi_wh(struct inode *inode, aufs_bindex_t bindex, ++ struct dentry *h_wh); ++unsigned int au_hi_flags(struct inode *inode, int isdir); ++ ++/* hinode flags */ ++#define AuHi_XINO 1 ++#define AuHi_HNOTIFY (1 << 1) ++#define au_ftest_hi(flags, name) ((flags) & AuHi_##name) ++#define au_fset_hi(flags, name) \ ++ do { (flags) |= AuHi_##name; } while (0) ++#define au_fclr_hi(flags, name) \ ++ do { (flags) &= ~AuHi_##name; } while (0) ++ ++#ifndef CONFIG_AUFS_HNOTIFY ++#undef AuHi_HNOTIFY ++#define AuHi_HNOTIFY 0 ++#endif ++ ++void au_set_h_iptr(struct inode *inode, aufs_bindex_t bindex, ++ struct inode *h_inode, unsigned int flags); ++ ++void au_update_iigen(struct inode *inode, int half); ++void au_update_ibrange(struct inode *inode, int do_put_zero); ++ ++void au_icntnr_init_once(void *_c); ++void au_hinode_init(struct au_hinode *hinode); ++int au_iinfo_init(struct inode *inode); ++void au_iinfo_fin(struct inode *inode); ++int au_hinode_realloc(struct au_iinfo *iinfo, int nbr); ++ ++#ifdef CONFIG_PROC_FS ++/* plink.c */ ++int au_plink_maint(struct super_block *sb, int flags); ++struct au_sbinfo; ++void au_plink_maint_leave(struct au_sbinfo *sbinfo); ++int au_plink_maint_enter(struct super_block *sb); ++#ifdef CONFIG_AUFS_DEBUG ++void au_plink_list(struct super_block *sb); ++#else ++AuStubVoid(au_plink_list, struct super_block *sb) ++#endif ++int au_plink_test(struct inode *inode); ++struct dentry *au_plink_lkup(struct inode *inode, aufs_bindex_t bindex); ++void au_plink_append(struct inode *inode, aufs_bindex_t bindex, ++ struct dentry *h_dentry); ++void au_plink_put(struct super_block *sb, int verbose); ++void au_plink_clean(struct super_block *sb, int verbose); ++void au_plink_half_refresh(struct super_block *sb, aufs_bindex_t br_id); ++#else ++AuStubInt0(au_plink_maint, struct super_block *sb, int flags); ++AuStubVoid(au_plink_maint_leave, struct au_sbinfo *sbinfo); ++AuStubInt0(au_plink_maint_enter, struct super_block *sb); ++AuStubVoid(au_plink_list, struct super_block *sb); ++AuStubInt0(au_plink_test, struct inode *inode); ++AuStub(struct dentry *, au_plink_lkup, return NULL, ++ struct inode *inode, aufs_bindex_t bindex); ++AuStubVoid(au_plink_append, struct inode *inode, aufs_bindex_t bindex, ++ struct dentry *h_dentry); ++AuStubVoid(au_plink_put, struct super_block *sb, int verbose); ++AuStubVoid(au_plink_clean, struct super_block *sb, int verbose); ++AuStubVoid(au_plink_half_refresh, struct super_block *sb, aufs_bindex_t br_id); ++#endif /* CONFIG_PROC_FS */ ++ ++#ifdef CONFIG_AUFS_XATTR ++/* xattr.c */ ++int au_cpup_xattr(struct dentry *h_dst, struct dentry *h_src, int ignore_flags, ++ unsigned int verbose); ++ssize_t aufs_listxattr(struct dentry *dentry, char *list, size_t size); ++ssize_t aufs_getxattr(struct dentry *dentry, const char *name, void *value, ++ size_t size); ++int aufs_setxattr(struct dentry *dentry, const char *name, const void *value, ++ size_t size, int flags); ++int aufs_removexattr(struct dentry *dentry, const char *name); ++ ++/* void au_xattr_init(struct super_block *sb); */ ++#else ++AuStubInt0(au_cpup_xattr, struct dentry *h_dst, struct dentry *h_src, ++ int ignore_flags, unsigned int verbose); ++/* AuStubVoid(au_xattr_init, struct super_block *sb); */ ++#endif ++ ++#ifdef CONFIG_FS_POSIX_ACL ++struct posix_acl *aufs_get_acl(struct inode *inode, int type); ++int aufs_set_acl(struct inode *inode, struct posix_acl *acl, int type); ++#endif ++ ++#if IS_ENABLED(CONFIG_AUFS_XATTR) || IS_ENABLED(CONFIG_FS_POSIX_ACL) ++enum { ++ AU_XATTR_SET, ++ AU_XATTR_REMOVE, ++ AU_ACL_SET ++}; ++ ++struct au_srxattr { ++ int type; ++ union { ++ struct { ++ const char *name; ++ const void *value; ++ size_t size; ++ int flags; ++ } set; ++ struct { ++ const char *name; ++ } remove; ++ struct { ++ struct posix_acl *acl; ++ int type; ++ } acl_set; ++ } u; ++}; ++ssize_t au_srxattr(struct dentry *dentry, struct au_srxattr *arg); ++#endif ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* lock subclass for iinfo */ ++enum { ++ AuLsc_II_CHILD, /* child first */ ++ AuLsc_II_CHILD2, /* rename(2), link(2), and cpup at hnotify */ ++ AuLsc_II_CHILD3, /* copyup dirs */ ++ AuLsc_II_PARENT, /* see AuLsc_I_PARENT in vfsub.h */ ++ AuLsc_II_PARENT2, ++ AuLsc_II_PARENT3, /* copyup dirs */ ++ AuLsc_II_NEW_CHILD ++}; ++ ++/* ++ * ii_read_lock_child, ii_write_lock_child, ++ * ii_read_lock_child2, ii_write_lock_child2, ++ * ii_read_lock_child3, ii_write_lock_child3, ++ * ii_read_lock_parent, ii_write_lock_parent, ++ * ii_read_lock_parent2, ii_write_lock_parent2, ++ * ii_read_lock_parent3, ii_write_lock_parent3, ++ * ii_read_lock_new_child, ii_write_lock_new_child, ++ */ ++#define AuReadLockFunc(name, lsc) \ ++static inline void ii_read_lock_##name(struct inode *i) \ ++{ \ ++ au_rw_read_lock_nested(&au_ii(i)->ii_rwsem, AuLsc_II_##lsc); \ ++} ++ ++#define AuWriteLockFunc(name, lsc) \ ++static inline void ii_write_lock_##name(struct inode *i) \ ++{ \ ++ au_rw_write_lock_nested(&au_ii(i)->ii_rwsem, AuLsc_II_##lsc); \ ++} ++ ++#define AuRWLockFuncs(name, lsc) \ ++ AuReadLockFunc(name, lsc) \ ++ AuWriteLockFunc(name, lsc) ++ ++AuRWLockFuncs(child, CHILD); ++AuRWLockFuncs(child2, CHILD2); ++AuRWLockFuncs(child3, CHILD3); ++AuRWLockFuncs(parent, PARENT); ++AuRWLockFuncs(parent2, PARENT2); ++AuRWLockFuncs(parent3, PARENT3); ++AuRWLockFuncs(new_child, NEW_CHILD); ++ ++#undef AuReadLockFunc ++#undef AuWriteLockFunc ++#undef AuRWLockFuncs ++ ++/* ++ * ii_read_unlock, ii_write_unlock, ii_downgrade_lock ++ */ ++AuSimpleUnlockRwsemFuncs(ii, struct inode *i, &au_ii(i)->ii_rwsem); ++ ++#define IiMustNoWaiters(i) AuRwMustNoWaiters(&au_ii(i)->ii_rwsem) ++#define IiMustAnyLock(i) AuRwMustAnyLock(&au_ii(i)->ii_rwsem) ++#define IiMustWriteLock(i) AuRwMustWriteLock(&au_ii(i)->ii_rwsem) ++ ++/* ---------------------------------------------------------------------- */ ++ ++static inline void au_icntnr_init(struct au_icntnr *c) ++{ ++#ifdef CONFIG_AUFS_DEBUG ++ c->vfs_inode.i_mode = 0; ++#endif ++} ++ ++static inline unsigned int au_iigen(struct inode *inode, unsigned int *igflags) ++{ ++ unsigned int gen; ++ struct au_iinfo *iinfo; ++ struct au_iigen *iigen; ++ ++ iinfo = au_ii(inode); ++ iigen = &iinfo->ii_generation; ++ spin_lock(&iigen->ig_spin); ++ if (igflags) ++ *igflags = iigen->ig_flags; ++ gen = iigen->ig_generation; ++ spin_unlock(&iigen->ig_spin); ++ ++ return gen; ++} ++ ++/* tiny test for inode number */ ++/* tmpfs generation is too rough */ ++static inline int au_test_higen(struct inode *inode, struct inode *h_inode) ++{ ++ struct au_iinfo *iinfo; ++ ++ iinfo = au_ii(inode); ++ AuRwMustAnyLock(&iinfo->ii_rwsem); ++ return !(iinfo->ii_hsb1 == h_inode->i_sb ++ && iinfo->ii_higen == h_inode->i_generation); ++} ++ ++static inline void au_iigen_dec(struct inode *inode) ++{ ++ struct au_iinfo *iinfo; ++ struct au_iigen *iigen; ++ ++ iinfo = au_ii(inode); ++ iigen = &iinfo->ii_generation; ++ spin_lock(&iigen->ig_spin); ++ iigen->ig_generation--; ++ spin_unlock(&iigen->ig_spin); ++} ++ ++static inline int au_iigen_test(struct inode *inode, unsigned int sigen) ++{ ++ int err; ++ ++ err = 0; ++ if (unlikely(inode && au_iigen(inode, NULL) != sigen)) ++ err = -EIO; ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static inline struct au_hinode *au_hinode(struct au_iinfo *iinfo, ++ aufs_bindex_t bindex) ++{ ++ return iinfo->ii_hinode + bindex; ++} ++ ++static inline aufs_bindex_t au_ii_br_id(struct inode *inode, ++ aufs_bindex_t bindex) ++{ ++ IiMustAnyLock(inode); ++ return au_hinode(au_ii(inode), bindex)->hi_id; ++} ++ ++static inline aufs_bindex_t au_ibtop(struct inode *inode) ++{ ++ IiMustAnyLock(inode); ++ return au_ii(inode)->ii_btop; ++} ++ ++static inline aufs_bindex_t au_ibbot(struct inode *inode) ++{ ++ IiMustAnyLock(inode); ++ return au_ii(inode)->ii_bbot; ++} ++ ++static inline struct au_vdir *au_ivdir(struct inode *inode) ++{ ++ IiMustAnyLock(inode); ++ return au_ii(inode)->ii_vdir; ++} ++ ++static inline struct dentry *au_hi_wh(struct inode *inode, aufs_bindex_t bindex) ++{ ++ IiMustAnyLock(inode); ++ return au_hinode(au_ii(inode), bindex)->hi_whdentry; ++} ++ ++static inline void au_set_ibtop(struct inode *inode, aufs_bindex_t bindex) ++{ ++ IiMustWriteLock(inode); ++ au_ii(inode)->ii_btop = bindex; ++} ++ ++static inline void au_set_ibbot(struct inode *inode, aufs_bindex_t bindex) ++{ ++ IiMustWriteLock(inode); ++ au_ii(inode)->ii_bbot = bindex; ++} ++ ++static inline void au_set_ivdir(struct inode *inode, struct au_vdir *vdir) ++{ ++ IiMustWriteLock(inode); ++ au_ii(inode)->ii_vdir = vdir; ++} ++ ++static inline struct au_hinode *au_hi(struct inode *inode, aufs_bindex_t bindex) ++{ ++ IiMustAnyLock(inode); ++ return au_hinode(au_ii(inode), bindex); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static inline struct dentry *au_pinned_parent(struct au_pin *pin) ++{ ++ if (pin) ++ return pin->parent; ++ return NULL; ++} ++ ++static inline struct inode *au_pinned_h_dir(struct au_pin *pin) ++{ ++ if (pin && pin->hdir) ++ return pin->hdir->hi_inode; ++ return NULL; ++} ++ ++static inline struct au_hinode *au_pinned_hdir(struct au_pin *pin) ++{ ++ if (pin) ++ return pin->hdir; ++ return NULL; ++} ++ ++static inline void au_pin_set_dentry(struct au_pin *pin, struct dentry *dentry) ++{ ++ if (pin) ++ pin->dentry = dentry; ++} ++ ++static inline void au_pin_set_parent_lflag(struct au_pin *pin, ++ unsigned char lflag) ++{ ++ if (pin) { ++ if (lflag) ++ au_fset_pin(pin->flags, DI_LOCKED); ++ else ++ au_fclr_pin(pin->flags, DI_LOCKED); ++ } ++} ++ ++#if 0 /* reserved */ ++static inline void au_pin_set_parent(struct au_pin *pin, struct dentry *parent) ++{ ++ if (pin) { ++ dput(pin->parent); ++ pin->parent = dget(parent); ++ } ++} ++#endif ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct au_branch; ++#ifdef CONFIG_AUFS_HNOTIFY ++struct au_hnotify_op { ++ void (*ctl)(struct au_hinode *hinode, int do_set); ++ int (*alloc)(struct au_hinode *hinode); ++ ++ /* ++ * if it returns true, the the caller should free hinode->hi_notify, ++ * otherwise ->free() frees it. ++ */ ++ int (*free)(struct au_hinode *hinode, ++ struct au_hnotify *hn) __must_check; ++ ++ void (*fin)(void); ++ int (*init)(void); ++ ++ int (*reset_br)(unsigned int udba, struct au_branch *br, int perm); ++ void (*fin_br)(struct au_branch *br); ++ int (*init_br)(struct au_branch *br, int perm); ++}; ++ ++/* hnotify.c */ ++int au_hn_alloc(struct au_hinode *hinode, struct inode *inode); ++void au_hn_free(struct au_hinode *hinode); ++void au_hn_ctl(struct au_hinode *hinode, int do_set); ++void au_hn_reset(struct inode *inode, unsigned int flags); ++int au_hnotify(struct inode *h_dir, struct au_hnotify *hnotify, u32 mask, ++ struct qstr *h_child_qstr, struct inode *h_child_inode); ++int au_hnotify_reset_br(unsigned int udba, struct au_branch *br, int perm); ++int au_hnotify_init_br(struct au_branch *br, int perm); ++void au_hnotify_fin_br(struct au_branch *br); ++int __init au_hnotify_init(void); ++void au_hnotify_fin(void); ++ ++/* hfsnotify.c */ ++extern const struct au_hnotify_op au_hnotify_op; ++ ++static inline ++void au_hn_init(struct au_hinode *hinode) ++{ ++ hinode->hi_notify = NULL; ++} ++ ++static inline struct au_hnotify *au_hn(struct au_hinode *hinode) ++{ ++ return hinode->hi_notify; ++} ++ ++#else ++AuStub(int, au_hn_alloc, return -EOPNOTSUPP, ++ struct au_hinode *hinode __maybe_unused, ++ struct inode *inode __maybe_unused) ++AuStub(struct au_hnotify *, au_hn, return NULL, struct au_hinode *hinode) ++AuStubVoid(au_hn_free, struct au_hinode *hinode __maybe_unused) ++AuStubVoid(au_hn_ctl, struct au_hinode *hinode __maybe_unused, ++ int do_set __maybe_unused) ++AuStubVoid(au_hn_reset, struct inode *inode __maybe_unused, ++ unsigned int flags __maybe_unused) ++AuStubInt0(au_hnotify_reset_br, unsigned int udba __maybe_unused, ++ struct au_branch *br __maybe_unused, ++ int perm __maybe_unused) ++AuStubInt0(au_hnotify_init_br, struct au_branch *br __maybe_unused, ++ int perm __maybe_unused) ++AuStubVoid(au_hnotify_fin_br, struct au_branch *br __maybe_unused) ++AuStubInt0(__init au_hnotify_init, void) ++AuStubVoid(au_hnotify_fin, void) ++AuStubVoid(au_hn_init, struct au_hinode *hinode __maybe_unused) ++#endif /* CONFIG_AUFS_HNOTIFY */ ++ ++static inline void au_hn_suspend(struct au_hinode *hdir) ++{ ++ au_hn_ctl(hdir, /*do_set*/0); ++} ++ ++static inline void au_hn_resume(struct au_hinode *hdir) ++{ ++ au_hn_ctl(hdir, /*do_set*/1); ++} ++ ++static inline void au_hn_imtx_lock(struct au_hinode *hdir) ++{ ++ inode_lock(hdir->hi_inode); ++ au_hn_suspend(hdir); ++} ++ ++static inline void au_hn_imtx_lock_nested(struct au_hinode *hdir, ++ unsigned int sc __maybe_unused) ++{ ++ inode_lock_nested(hdir->hi_inode, sc); ++ au_hn_suspend(hdir); ++} ++ ++static inline void au_hn_imtx_unlock(struct au_hinode *hdir) ++{ ++ au_hn_resume(hdir); ++ inode_unlock(hdir->hi_inode); ++} ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_INODE_H__ */ +diff --git a/fs/aufs/ioctl.c b/fs/aufs/ioctl.c +new file mode 100644 +index 0000000..fc5529b +--- /dev/null ++++ b/fs/aufs/ioctl.c +@@ -0,0 +1,219 @@ ++/* ++ * Copyright (C) 2005-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * ioctl ++ * plink-management and readdir in userspace. ++ * assist the pathconf(3) wrapper library. ++ * move-down ++ * File-based Hierarchical Storage Management. ++ */ ++ ++#include ++#include ++#include "aufs.h" ++ ++static int au_wbr_fd(struct path *path, struct aufs_wbr_fd __user *arg) ++{ ++ int err, fd; ++ aufs_bindex_t wbi, bindex, bbot; ++ struct file *h_file; ++ struct super_block *sb; ++ struct dentry *root; ++ struct au_branch *br; ++ struct aufs_wbr_fd wbrfd = { ++ .oflags = au_dir_roflags, ++ .brid = -1 ++ }; ++ const int valid = O_RDONLY | O_NONBLOCK | O_LARGEFILE | O_DIRECTORY ++ | O_NOATIME | O_CLOEXEC; ++ ++ AuDebugOn(wbrfd.oflags & ~valid); ++ ++ if (arg) { ++ err = copy_from_user(&wbrfd, arg, sizeof(wbrfd)); ++ if (unlikely(err)) { ++ err = -EFAULT; ++ goto out; ++ } ++ ++ err = -EINVAL; ++ AuDbg("wbrfd{0%o, %d}\n", wbrfd.oflags, wbrfd.brid); ++ wbrfd.oflags |= au_dir_roflags; ++ AuDbg("0%o\n", wbrfd.oflags); ++ if (unlikely(wbrfd.oflags & ~valid)) ++ goto out; ++ } ++ ++ fd = get_unused_fd_flags(0); ++ err = fd; ++ if (unlikely(fd < 0)) ++ goto out; ++ ++ h_file = ERR_PTR(-EINVAL); ++ wbi = 0; ++ br = NULL; ++ sb = path->dentry->d_sb; ++ root = sb->s_root; ++ aufs_read_lock(root, AuLock_IR); ++ bbot = au_sbbot(sb); ++ if (wbrfd.brid >= 0) { ++ wbi = au_br_index(sb, wbrfd.brid); ++ if (unlikely(wbi < 0 || wbi > bbot)) ++ goto out_unlock; ++ } ++ ++ h_file = ERR_PTR(-ENOENT); ++ br = au_sbr(sb, wbi); ++ if (!au_br_writable(br->br_perm)) { ++ if (arg) ++ goto out_unlock; ++ ++ bindex = wbi + 1; ++ wbi = -1; ++ for (; bindex <= bbot; bindex++) { ++ br = au_sbr(sb, bindex); ++ if (au_br_writable(br->br_perm)) { ++ wbi = bindex; ++ br = au_sbr(sb, wbi); ++ break; ++ } ++ } ++ } ++ AuDbg("wbi %d\n", wbi); ++ if (wbi >= 0) ++ h_file = au_h_open(root, wbi, wbrfd.oflags, NULL, ++ /*force_wr*/0); ++ ++out_unlock: ++ aufs_read_unlock(root, AuLock_IR); ++ err = PTR_ERR(h_file); ++ if (IS_ERR(h_file)) ++ goto out_fd; ++ ++ au_br_put(br); /* cf. au_h_open() */ ++ fd_install(fd, h_file); ++ err = fd; ++ goto out; /* success */ ++ ++out_fd: ++ put_unused_fd(fd); ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++long aufs_ioctl_dir(struct file *file, unsigned int cmd, unsigned long arg) ++{ ++ long err; ++ struct dentry *dentry; ++ ++ switch (cmd) { ++ case AUFS_CTL_RDU: ++ case AUFS_CTL_RDU_INO: ++ err = au_rdu_ioctl(file, cmd, arg); ++ break; ++ ++ case AUFS_CTL_WBR_FD: ++ err = au_wbr_fd(&file->f_path, (void __user *)arg); ++ break; ++ ++ case AUFS_CTL_IBUSY: ++ err = au_ibusy_ioctl(file, arg); ++ break; ++ ++ case AUFS_CTL_BRINFO: ++ err = au_brinfo_ioctl(file, arg); ++ break; ++ ++ case AUFS_CTL_FHSM_FD: ++ dentry = file->f_path.dentry; ++ if (IS_ROOT(dentry)) ++ err = au_fhsm_fd(dentry->d_sb, arg); ++ else ++ err = -ENOTTY; ++ break; ++ ++ default: ++ /* do not call the lower */ ++ AuDbg("0x%x\n", cmd); ++ err = -ENOTTY; ++ } ++ ++ AuTraceErr(err); ++ return err; ++} ++ ++long aufs_ioctl_nondir(struct file *file, unsigned int cmd, unsigned long arg) ++{ ++ long err; ++ ++ switch (cmd) { ++ case AUFS_CTL_MVDOWN: ++ err = au_mvdown(file->f_path.dentry, (void __user *)arg); ++ break; ++ ++ case AUFS_CTL_WBR_FD: ++ err = au_wbr_fd(&file->f_path, (void __user *)arg); ++ break; ++ ++ default: ++ /* do not call the lower */ ++ AuDbg("0x%x\n", cmd); ++ err = -ENOTTY; ++ } ++ ++ AuTraceErr(err); ++ return err; ++} ++ ++#ifdef CONFIG_COMPAT ++long aufs_compat_ioctl_dir(struct file *file, unsigned int cmd, ++ unsigned long arg) ++{ ++ long err; ++ ++ switch (cmd) { ++ case AUFS_CTL_RDU: ++ case AUFS_CTL_RDU_INO: ++ err = au_rdu_compat_ioctl(file, cmd, arg); ++ break; ++ ++ case AUFS_CTL_IBUSY: ++ err = au_ibusy_compat_ioctl(file, arg); ++ break; ++ ++ case AUFS_CTL_BRINFO: ++ err = au_brinfo_compat_ioctl(file, arg); ++ break; ++ ++ default: ++ err = aufs_ioctl_dir(file, cmd, arg); ++ } ++ ++ AuTraceErr(err); ++ return err; ++} ++ ++long aufs_compat_ioctl_nondir(struct file *file, unsigned int cmd, ++ unsigned long arg) ++{ ++ return aufs_ioctl_nondir(file, cmd, (unsigned long)compat_ptr(arg)); ++} ++#endif +diff --git a/fs/aufs/loop.c b/fs/aufs/loop.c +new file mode 100644 +index 0000000..8a69754 +--- /dev/null ++++ b/fs/aufs/loop.c +@@ -0,0 +1,146 @@ ++/* ++ * Copyright (C) 2005-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * support for loopback block device as a branch ++ */ ++ ++#include "aufs.h" ++ ++/* added into drivers/block/loop.c */ ++static struct file *(*backing_file_func)(struct super_block *sb); ++ ++/* ++ * test if two lower dentries have overlapping branches. ++ */ ++int au_test_loopback_overlap(struct super_block *sb, struct dentry *h_adding) ++{ ++ struct super_block *h_sb; ++ struct file *backing_file; ++ ++ if (unlikely(!backing_file_func)) { ++ /* don't load "loop" module here */ ++ backing_file_func = symbol_get(loop_backing_file); ++ if (unlikely(!backing_file_func)) ++ /* "loop" module is not loaded */ ++ return 0; ++ } ++ ++ h_sb = h_adding->d_sb; ++ backing_file = backing_file_func(h_sb); ++ if (!backing_file) ++ return 0; ++ ++ h_adding = backing_file->f_path.dentry; ++ /* ++ * h_adding can be local NFS. ++ * in this case aufs cannot detect the loop. ++ */ ++ if (unlikely(h_adding->d_sb == sb)) ++ return 1; ++ return !!au_test_subdir(h_adding, sb->s_root); ++} ++ ++/* true if a kernel thread named 'loop[0-9].*' accesses a file */ ++int au_test_loopback_kthread(void) ++{ ++ int ret; ++ struct task_struct *tsk = current; ++ char c, comm[sizeof(tsk->comm)]; ++ ++ ret = 0; ++ if (tsk->flags & PF_KTHREAD) { ++ get_task_comm(comm, tsk); ++ c = comm[4]; ++ ret = ('0' <= c && c <= '9' ++ && !strncmp(comm, "loop", 4)); ++ } ++ ++ return ret; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++#define au_warn_loopback_step 16 ++static int au_warn_loopback_nelem = au_warn_loopback_step; ++static unsigned long *au_warn_loopback_array; ++ ++void au_warn_loopback(struct super_block *h_sb) ++{ ++ int i, new_nelem; ++ unsigned long *a, magic; ++ static DEFINE_SPINLOCK(spin); ++ ++ magic = h_sb->s_magic; ++ spin_lock(&spin); ++ a = au_warn_loopback_array; ++ for (i = 0; i < au_warn_loopback_nelem && *a; i++) ++ if (a[i] == magic) { ++ spin_unlock(&spin); ++ return; ++ } ++ ++ /* h_sb is new to us, print it */ ++ if (i < au_warn_loopback_nelem) { ++ a[i] = magic; ++ goto pr; ++ } ++ ++ /* expand the array */ ++ new_nelem = au_warn_loopback_nelem + au_warn_loopback_step; ++ a = au_kzrealloc(au_warn_loopback_array, ++ au_warn_loopback_nelem * sizeof(unsigned long), ++ new_nelem * sizeof(unsigned long), GFP_ATOMIC); ++ if (a) { ++ au_warn_loopback_nelem = new_nelem; ++ au_warn_loopback_array = a; ++ a[i] = magic; ++ goto pr; ++ } ++ ++ spin_unlock(&spin); ++ AuWarn1("realloc failed, ignored\n"); ++ return; ++ ++pr: ++ spin_unlock(&spin); ++ pr_warn("you may want to try another patch for loopback file " ++ "on %s(0x%lx) branch\n", au_sbtype(h_sb), magic); ++} ++ ++int au_loopback_init(void) ++{ ++ int err; ++ struct super_block *sb __maybe_unused; ++ ++ BUILD_BUG_ON(sizeof(sb->s_magic) != sizeof(unsigned long)); ++ ++ err = 0; ++ au_warn_loopback_array = kcalloc(au_warn_loopback_step, ++ sizeof(unsigned long), GFP_NOFS); ++ if (unlikely(!au_warn_loopback_array)) ++ err = -ENOMEM; ++ ++ return err; ++} ++ ++void au_loopback_fin(void) ++{ ++ if (backing_file_func) ++ symbol_put(loop_backing_file); ++ kfree(au_warn_loopback_array); ++} +diff --git a/fs/aufs/loop.h b/fs/aufs/loop.h +new file mode 100644 +index 0000000..35f7446 +--- /dev/null ++++ b/fs/aufs/loop.h +@@ -0,0 +1,52 @@ ++/* ++ * Copyright (C) 2005-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * support for loopback mount as a branch ++ */ ++ ++#ifndef __AUFS_LOOP_H__ ++#define __AUFS_LOOP_H__ ++ ++#ifdef __KERNEL__ ++ ++struct dentry; ++struct super_block; ++ ++#ifdef CONFIG_AUFS_BDEV_LOOP ++/* drivers/block/loop.c */ ++struct file *loop_backing_file(struct super_block *sb); ++ ++/* loop.c */ ++int au_test_loopback_overlap(struct super_block *sb, struct dentry *h_adding); ++int au_test_loopback_kthread(void); ++void au_warn_loopback(struct super_block *h_sb); ++ ++int au_loopback_init(void); ++void au_loopback_fin(void); ++#else ++AuStubInt0(au_test_loopback_overlap, struct super_block *sb, ++ struct dentry *h_adding) ++AuStubInt0(au_test_loopback_kthread, void) ++AuStubVoid(au_warn_loopback, struct super_block *h_sb) ++ ++AuStubInt0(au_loopback_init, void) ++AuStubVoid(au_loopback_fin, void) ++#endif /* BLK_DEV_LOOP */ ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_LOOP_H__ */ +diff --git a/fs/aufs/magic.mk b/fs/aufs/magic.mk +new file mode 100644 +index 0000000..4f83bdf +--- /dev/null ++++ b/fs/aufs/magic.mk +@@ -0,0 +1,30 @@ ++ ++# defined in ${srctree}/fs/fuse/inode.c ++# tristate ++ifdef CONFIG_FUSE_FS ++ccflags-y += -DFUSE_SUPER_MAGIC=0x65735546 ++endif ++ ++# defined in ${srctree}/fs/xfs/xfs_sb.h ++# tristate ++ifdef CONFIG_XFS_FS ++ccflags-y += -DXFS_SB_MAGIC=0x58465342 ++endif ++ ++# defined in ${srctree}/fs/configfs/mount.c ++# tristate ++ifdef CONFIG_CONFIGFS_FS ++ccflags-y += -DCONFIGFS_MAGIC=0x62656570 ++endif ++ ++# defined in ${srctree}/fs/ubifs/ubifs.h ++# tristate ++ifdef CONFIG_UBIFS_FS ++ccflags-y += -DUBIFS_SUPER_MAGIC=0x24051905 ++endif ++ ++# defined in ${srctree}/fs/hfsplus/hfsplus_raw.h ++# tristate ++ifdef CONFIG_HFSPLUS_FS ++ccflags-y += -DHFSPLUS_SUPER_MAGIC=0x482b ++endif +diff --git a/fs/aufs/module.c b/fs/aufs/module.c +new file mode 100644 +index 0000000..4a2e668 +--- /dev/null ++++ b/fs/aufs/module.c +@@ -0,0 +1,223 @@ ++/* ++ * Copyright (C) 2005-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * module global variables and operations ++ */ ++ ++#include ++#include ++#include "aufs.h" ++ ++void *au_kzrealloc(void *p, unsigned int nused, unsigned int new_sz, gfp_t gfp) ++{ ++ if (new_sz <= nused) ++ return p; ++ ++ p = krealloc(p, new_sz, gfp); ++ if (p) ++ memset(p + nused, 0, new_sz - nused); ++ return p; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * aufs caches ++ */ ++struct kmem_cache *au_cachep[AuCache_Last] = { ++ [0] = NULL ++}; ++ ++static void au_cache_fin(void) ++{ ++ int i; ++ ++ /* ++ * Make sure all delayed rcu free inodes are flushed before we ++ * destroy cache. ++ */ ++ rcu_barrier(); ++ ++ /* excluding AuCache_HNOTIFY */ ++ BUILD_BUG_ON(AuCache_HNOTIFY + 1 != AuCache_Last); ++ for (i = 0; i < AuCache_HNOTIFY; i++) { ++ kmem_cache_destroy(au_cachep[i]); ++ au_cachep[i] = NULL; ++ } ++} ++ ++static int __init au_cache_init(void) ++{ ++ au_cachep[AuCache_DINFO] = AuCacheCtor(au_dinfo, au_di_init_once); ++ if (au_cachep[AuCache_DINFO]) ++ /* SLAB_DESTROY_BY_RCU */ ++ au_cachep[AuCache_ICNTNR] = AuCacheCtor(au_icntnr, ++ au_icntnr_init_once); ++ if (au_cachep[AuCache_ICNTNR]) ++ au_cachep[AuCache_FINFO] = AuCacheCtor(au_finfo, ++ au_fi_init_once); ++ if (au_cachep[AuCache_FINFO]) ++ au_cachep[AuCache_VDIR] = AuCache(au_vdir); ++ if (au_cachep[AuCache_VDIR]) ++ au_cachep[AuCache_DEHSTR] = AuCache(au_vdir_dehstr); ++ if (au_cachep[AuCache_DEHSTR]) ++ return 0; ++ ++ au_cache_fin(); ++ return -ENOMEM; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++int au_dir_roflags; ++ ++#ifdef CONFIG_AUFS_SBILIST ++/* ++ * iterate_supers_type() doesn't protect us from ++ * remounting (branch management) ++ */ ++struct au_sphlhead au_sbilist; ++#endif ++ ++/* ++ * functions for module interface. ++ */ ++MODULE_LICENSE("GPL"); ++/* MODULE_LICENSE("GPL v2"); */ ++MODULE_AUTHOR("Junjiro R. Okajima "); ++MODULE_DESCRIPTION(AUFS_NAME ++ " -- Advanced multi layered unification filesystem"); ++MODULE_VERSION(AUFS_VERSION); ++MODULE_ALIAS_FS(AUFS_NAME); ++ ++/* this module parameter has no meaning when SYSFS is disabled */ ++int sysaufs_brs = 1; ++MODULE_PARM_DESC(brs, "use /fs/aufs/si_*/brN"); ++module_param_named(brs, sysaufs_brs, int, S_IRUGO); ++ ++/* this module parameter has no meaning when USER_NS is disabled */ ++bool au_userns; ++MODULE_PARM_DESC(allow_userns, "allow unprivileged to mount under userns"); ++module_param_named(allow_userns, au_userns, bool, S_IRUGO); ++ ++/* ---------------------------------------------------------------------- */ ++ ++static char au_esc_chars[0x20 + 3]; /* 0x01-0x20, backslash, del, and NULL */ ++ ++int au_seq_path(struct seq_file *seq, struct path *path) ++{ ++ int err; ++ ++ err = seq_path(seq, path, au_esc_chars); ++ if (err > 0) ++ err = 0; ++ else if (err < 0) ++ err = -ENOMEM; ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int __init aufs_init(void) ++{ ++ int err, i; ++ char *p; ++ ++ p = au_esc_chars; ++ for (i = 1; i <= ' '; i++) ++ *p++ = i; ++ *p++ = '\\'; ++ *p++ = '\x7f'; ++ *p = 0; ++ ++ au_dir_roflags = au_file_roflags(O_DIRECTORY | O_LARGEFILE); ++ ++ memcpy(aufs_iop_nogetattr, aufs_iop, sizeof(aufs_iop)); ++ for (i = 0; i < AuIop_Last; i++) ++ aufs_iop_nogetattr[i].getattr = NULL; ++ ++ au_sbilist_init(); ++ sysaufs_brs_init(); ++ au_debug_init(); ++ au_dy_init(); ++ err = sysaufs_init(); ++ if (unlikely(err)) ++ goto out; ++ err = au_procfs_init(); ++ if (unlikely(err)) ++ goto out_sysaufs; ++ err = au_wkq_init(); ++ if (unlikely(err)) ++ goto out_procfs; ++ err = au_loopback_init(); ++ if (unlikely(err)) ++ goto out_wkq; ++ err = au_hnotify_init(); ++ if (unlikely(err)) ++ goto out_loopback; ++ err = au_sysrq_init(); ++ if (unlikely(err)) ++ goto out_hin; ++ err = au_cache_init(); ++ if (unlikely(err)) ++ goto out_sysrq; ++ ++ aufs_fs_type.fs_flags |= au_userns ? FS_USERNS_MOUNT : 0; ++ err = register_filesystem(&aufs_fs_type); ++ if (unlikely(err)) ++ goto out_cache; ++ ++ /* since we define pr_fmt, call printk directly */ ++ printk(KERN_INFO AUFS_NAME " " AUFS_VERSION "\n"); ++ goto out; /* success */ ++ ++out_cache: ++ au_cache_fin(); ++out_sysrq: ++ au_sysrq_fin(); ++out_hin: ++ au_hnotify_fin(); ++out_loopback: ++ au_loopback_fin(); ++out_wkq: ++ au_wkq_fin(); ++out_procfs: ++ au_procfs_fin(); ++out_sysaufs: ++ sysaufs_fin(); ++ au_dy_fin(); ++out: ++ return err; ++} ++ ++static void __exit aufs_exit(void) ++{ ++ unregister_filesystem(&aufs_fs_type); ++ au_cache_fin(); ++ au_sysrq_fin(); ++ au_hnotify_fin(); ++ au_loopback_fin(); ++ au_wkq_fin(); ++ au_procfs_fin(); ++ sysaufs_fin(); ++ au_dy_fin(); ++} ++ ++module_init(aufs_init); ++module_exit(aufs_exit); +diff --git a/fs/aufs/module.h b/fs/aufs/module.h +new file mode 100644 +index 0000000..21344c6 +--- /dev/null ++++ b/fs/aufs/module.h +@@ -0,0 +1,89 @@ ++/* ++ * Copyright (C) 2005-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * module initialization and module-global ++ */ ++ ++#ifndef __AUFS_MODULE_H__ ++#define __AUFS_MODULE_H__ ++ ++#ifdef __KERNEL__ ++ ++#include ++ ++struct path; ++struct seq_file; ++ ++/* module parameters */ ++extern int sysaufs_brs; ++extern bool au_userns; ++ ++/* ---------------------------------------------------------------------- */ ++ ++extern int au_dir_roflags; ++ ++void *au_kzrealloc(void *p, unsigned int nused, unsigned int new_sz, gfp_t gfp); ++int au_seq_path(struct seq_file *seq, struct path *path); ++ ++#ifdef CONFIG_PROC_FS ++/* procfs.c */ ++int __init au_procfs_init(void); ++void au_procfs_fin(void); ++#else ++AuStubInt0(au_procfs_init, void); ++AuStubVoid(au_procfs_fin, void); ++#endif ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* kmem cache */ ++enum { ++ AuCache_DINFO, ++ AuCache_ICNTNR, ++ AuCache_FINFO, ++ AuCache_VDIR, ++ AuCache_DEHSTR, ++ AuCache_HNOTIFY, /* must be last */ ++ AuCache_Last ++}; ++ ++#define AuCacheFlags (SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD) ++#define AuCache(type) KMEM_CACHE(type, AuCacheFlags) ++#define AuCacheCtor(type, ctor) \ ++ kmem_cache_create(#type, sizeof(struct type), \ ++ __alignof__(struct type), AuCacheFlags, ctor) ++ ++extern struct kmem_cache *au_cachep[]; ++ ++#define AuCacheFuncs(name, index) \ ++static inline struct au_##name *au_cache_alloc_##name(void) \ ++{ return kmem_cache_alloc(au_cachep[AuCache_##index], GFP_NOFS); } \ ++static inline void au_cache_free_##name(struct au_##name *p) \ ++{ kmem_cache_free(au_cachep[AuCache_##index], p); } ++ ++AuCacheFuncs(dinfo, DINFO); ++AuCacheFuncs(icntnr, ICNTNR); ++AuCacheFuncs(finfo, FINFO); ++AuCacheFuncs(vdir, VDIR); ++AuCacheFuncs(vdir_dehstr, DEHSTR); ++#ifdef CONFIG_AUFS_HNOTIFY ++AuCacheFuncs(hnotify, HNOTIFY); ++#endif ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_MODULE_H__ */ +diff --git a/fs/aufs/mvdown.c b/fs/aufs/mvdown.c +new file mode 100644 +index 0000000..9aff9fc +--- /dev/null ++++ b/fs/aufs/mvdown.c +@@ -0,0 +1,703 @@ ++/* ++ * Copyright (C) 2011-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * move-down, opposite of copy-up ++ */ ++ ++#include "aufs.h" ++ ++struct au_mvd_args { ++ struct { ++ struct super_block *h_sb; ++ struct dentry *h_parent; ++ struct au_hinode *hdir; ++ struct inode *h_dir, *h_inode; ++ struct au_pin pin; ++ } info[AUFS_MVDOWN_NARRAY]; ++ ++ struct aufs_mvdown mvdown; ++ struct dentry *dentry, *parent; ++ struct inode *inode, *dir; ++ struct super_block *sb; ++ aufs_bindex_t bopq, bwh, bfound; ++ unsigned char rename_lock; ++}; ++ ++#define mvd_errno mvdown.au_errno ++#define mvd_bsrc mvdown.stbr[AUFS_MVDOWN_UPPER].bindex ++#define mvd_src_brid mvdown.stbr[AUFS_MVDOWN_UPPER].brid ++#define mvd_bdst mvdown.stbr[AUFS_MVDOWN_LOWER].bindex ++#define mvd_dst_brid mvdown.stbr[AUFS_MVDOWN_LOWER].brid ++ ++#define mvd_h_src_sb info[AUFS_MVDOWN_UPPER].h_sb ++#define mvd_h_src_parent info[AUFS_MVDOWN_UPPER].h_parent ++#define mvd_hdir_src info[AUFS_MVDOWN_UPPER].hdir ++#define mvd_h_src_dir info[AUFS_MVDOWN_UPPER].h_dir ++#define mvd_h_src_inode info[AUFS_MVDOWN_UPPER].h_inode ++#define mvd_pin_src info[AUFS_MVDOWN_UPPER].pin ++ ++#define mvd_h_dst_sb info[AUFS_MVDOWN_LOWER].h_sb ++#define mvd_h_dst_parent info[AUFS_MVDOWN_LOWER].h_parent ++#define mvd_hdir_dst info[AUFS_MVDOWN_LOWER].hdir ++#define mvd_h_dst_dir info[AUFS_MVDOWN_LOWER].h_dir ++#define mvd_h_dst_inode info[AUFS_MVDOWN_LOWER].h_inode ++#define mvd_pin_dst info[AUFS_MVDOWN_LOWER].pin ++ ++#define AU_MVD_PR(flag, ...) do { \ ++ if (flag) \ ++ pr_err(__VA_ARGS__); \ ++ } while (0) ++ ++static int find_lower_writable(struct au_mvd_args *a) ++{ ++ struct super_block *sb; ++ aufs_bindex_t bindex, bbot; ++ struct au_branch *br; ++ ++ sb = a->sb; ++ bindex = a->mvd_bsrc; ++ bbot = au_sbbot(sb); ++ if (a->mvdown.flags & AUFS_MVDOWN_FHSM_LOWER) ++ for (bindex++; bindex <= bbot; bindex++) { ++ br = au_sbr(sb, bindex); ++ if (au_br_fhsm(br->br_perm) ++ && (!(au_br_sb(br)->s_flags & MS_RDONLY))) ++ return bindex; ++ } ++ else if (!(a->mvdown.flags & AUFS_MVDOWN_ROLOWER)) ++ for (bindex++; bindex <= bbot; bindex++) { ++ br = au_sbr(sb, bindex); ++ if (!au_br_rdonly(br)) ++ return bindex; ++ } ++ else ++ for (bindex++; bindex <= bbot; bindex++) { ++ br = au_sbr(sb, bindex); ++ if (!(au_br_sb(br)->s_flags & MS_RDONLY)) { ++ if (au_br_rdonly(br)) ++ a->mvdown.flags ++ |= AUFS_MVDOWN_ROLOWER_R; ++ return bindex; ++ } ++ } ++ ++ return -1; ++} ++ ++/* make the parent dir on bdst */ ++static int au_do_mkdir(const unsigned char dmsg, struct au_mvd_args *a) ++{ ++ int err; ++ ++ err = 0; ++ a->mvd_hdir_src = au_hi(a->dir, a->mvd_bsrc); ++ a->mvd_hdir_dst = au_hi(a->dir, a->mvd_bdst); ++ a->mvd_h_src_parent = au_h_dptr(a->parent, a->mvd_bsrc); ++ a->mvd_h_dst_parent = NULL; ++ if (au_dbbot(a->parent) >= a->mvd_bdst) ++ a->mvd_h_dst_parent = au_h_dptr(a->parent, a->mvd_bdst); ++ if (!a->mvd_h_dst_parent) { ++ err = au_cpdown_dirs(a->dentry, a->mvd_bdst); ++ if (unlikely(err)) { ++ AU_MVD_PR(dmsg, "cpdown_dirs failed\n"); ++ goto out; ++ } ++ a->mvd_h_dst_parent = au_h_dptr(a->parent, a->mvd_bdst); ++ } ++ ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++/* lock them all */ ++static int au_do_lock(const unsigned char dmsg, struct au_mvd_args *a) ++{ ++ int err; ++ struct dentry *h_trap; ++ ++ a->mvd_h_src_sb = au_sbr_sb(a->sb, a->mvd_bsrc); ++ a->mvd_h_dst_sb = au_sbr_sb(a->sb, a->mvd_bdst); ++ err = au_pin(&a->mvd_pin_dst, a->dentry, a->mvd_bdst, ++ au_opt_udba(a->sb), ++ AuPin_MNT_WRITE | AuPin_DI_LOCKED); ++ AuTraceErr(err); ++ if (unlikely(err)) { ++ AU_MVD_PR(dmsg, "pin_dst failed\n"); ++ goto out; ++ } ++ ++ if (a->mvd_h_src_sb != a->mvd_h_dst_sb) { ++ a->rename_lock = 0; ++ au_pin_init(&a->mvd_pin_src, a->dentry, a->mvd_bsrc, ++ AuLsc_DI_PARENT, AuLsc_I_PARENT3, ++ au_opt_udba(a->sb), ++ AuPin_MNT_WRITE | AuPin_DI_LOCKED); ++ err = au_do_pin(&a->mvd_pin_src); ++ AuTraceErr(err); ++ a->mvd_h_src_dir = d_inode(a->mvd_h_src_parent); ++ if (unlikely(err)) { ++ AU_MVD_PR(dmsg, "pin_src failed\n"); ++ goto out_dst; ++ } ++ goto out; /* success */ ++ } ++ ++ a->rename_lock = 1; ++ au_pin_hdir_unlock(&a->mvd_pin_dst); ++ err = au_pin(&a->mvd_pin_src, a->dentry, a->mvd_bsrc, ++ au_opt_udba(a->sb), ++ AuPin_MNT_WRITE | AuPin_DI_LOCKED); ++ AuTraceErr(err); ++ a->mvd_h_src_dir = d_inode(a->mvd_h_src_parent); ++ if (unlikely(err)) { ++ AU_MVD_PR(dmsg, "pin_src failed\n"); ++ au_pin_hdir_lock(&a->mvd_pin_dst); ++ goto out_dst; ++ } ++ au_pin_hdir_unlock(&a->mvd_pin_src); ++ h_trap = vfsub_lock_rename(a->mvd_h_src_parent, a->mvd_hdir_src, ++ a->mvd_h_dst_parent, a->mvd_hdir_dst); ++ if (h_trap) { ++ err = (h_trap != a->mvd_h_src_parent); ++ if (err) ++ err = (h_trap != a->mvd_h_dst_parent); ++ } ++ BUG_ON(err); /* it should never happen */ ++ if (unlikely(a->mvd_h_src_dir != au_pinned_h_dir(&a->mvd_pin_src))) { ++ err = -EBUSY; ++ AuTraceErr(err); ++ vfsub_unlock_rename(a->mvd_h_src_parent, a->mvd_hdir_src, ++ a->mvd_h_dst_parent, a->mvd_hdir_dst); ++ au_pin_hdir_lock(&a->mvd_pin_src); ++ au_unpin(&a->mvd_pin_src); ++ au_pin_hdir_lock(&a->mvd_pin_dst); ++ goto out_dst; ++ } ++ goto out; /* success */ ++ ++out_dst: ++ au_unpin(&a->mvd_pin_dst); ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++static void au_do_unlock(const unsigned char dmsg, struct au_mvd_args *a) ++{ ++ if (!a->rename_lock) ++ au_unpin(&a->mvd_pin_src); ++ else { ++ vfsub_unlock_rename(a->mvd_h_src_parent, a->mvd_hdir_src, ++ a->mvd_h_dst_parent, a->mvd_hdir_dst); ++ au_pin_hdir_lock(&a->mvd_pin_src); ++ au_unpin(&a->mvd_pin_src); ++ au_pin_hdir_lock(&a->mvd_pin_dst); ++ } ++ au_unpin(&a->mvd_pin_dst); ++} ++ ++/* copy-down the file */ ++static int au_do_cpdown(const unsigned char dmsg, struct au_mvd_args *a) ++{ ++ int err; ++ struct au_cp_generic cpg = { ++ .dentry = a->dentry, ++ .bdst = a->mvd_bdst, ++ .bsrc = a->mvd_bsrc, ++ .len = -1, ++ .pin = &a->mvd_pin_dst, ++ .flags = AuCpup_DTIME | AuCpup_HOPEN ++ }; ++ ++ AuDbg("b%d, b%d\n", cpg.bsrc, cpg.bdst); ++ if (a->mvdown.flags & AUFS_MVDOWN_OWLOWER) ++ au_fset_cpup(cpg.flags, OVERWRITE); ++ if (a->mvdown.flags & AUFS_MVDOWN_ROLOWER) ++ au_fset_cpup(cpg.flags, RWDST); ++ err = au_sio_cpdown_simple(&cpg); ++ if (unlikely(err)) ++ AU_MVD_PR(dmsg, "cpdown failed\n"); ++ ++ AuTraceErr(err); ++ return err; ++} ++ ++/* ++ * unlink the whiteout on bdst if exist which may be created by UDBA while we ++ * were sleeping ++ */ ++static int au_do_unlink_wh(const unsigned char dmsg, struct au_mvd_args *a) ++{ ++ int err; ++ struct path h_path; ++ struct au_branch *br; ++ struct inode *delegated; ++ ++ br = au_sbr(a->sb, a->mvd_bdst); ++ h_path.dentry = au_wh_lkup(a->mvd_h_dst_parent, &a->dentry->d_name, br); ++ err = PTR_ERR(h_path.dentry); ++ if (IS_ERR(h_path.dentry)) { ++ AU_MVD_PR(dmsg, "wh_lkup failed\n"); ++ goto out; ++ } ++ ++ err = 0; ++ if (d_is_positive(h_path.dentry)) { ++ h_path.mnt = au_br_mnt(br); ++ delegated = NULL; ++ err = vfsub_unlink(d_inode(a->mvd_h_dst_parent), &h_path, ++ &delegated, /*force*/0); ++ if (unlikely(err == -EWOULDBLOCK)) { ++ pr_warn("cannot retry for NFSv4 delegation" ++ " for an internal unlink\n"); ++ iput(delegated); ++ } ++ if (unlikely(err)) ++ AU_MVD_PR(dmsg, "wh_unlink failed\n"); ++ } ++ dput(h_path.dentry); ++ ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++/* ++ * unlink the topmost h_dentry ++ */ ++static int au_do_unlink(const unsigned char dmsg, struct au_mvd_args *a) ++{ ++ int err; ++ struct path h_path; ++ struct inode *delegated; ++ ++ h_path.mnt = au_sbr_mnt(a->sb, a->mvd_bsrc); ++ h_path.dentry = au_h_dptr(a->dentry, a->mvd_bsrc); ++ delegated = NULL; ++ err = vfsub_unlink(a->mvd_h_src_dir, &h_path, &delegated, /*force*/0); ++ if (unlikely(err == -EWOULDBLOCK)) { ++ pr_warn("cannot retry for NFSv4 delegation" ++ " for an internal unlink\n"); ++ iput(delegated); ++ } ++ if (unlikely(err)) ++ AU_MVD_PR(dmsg, "unlink failed\n"); ++ ++ AuTraceErr(err); ++ return err; ++} ++ ++/* Since mvdown succeeded, we ignore an error of this function */ ++static void au_do_stfs(const unsigned char dmsg, struct au_mvd_args *a) ++{ ++ int err; ++ struct au_branch *br; ++ ++ a->mvdown.flags |= AUFS_MVDOWN_STFS_FAILED; ++ br = au_sbr(a->sb, a->mvd_bsrc); ++ err = au_br_stfs(br, &a->mvdown.stbr[AUFS_MVDOWN_UPPER].stfs); ++ if (!err) { ++ br = au_sbr(a->sb, a->mvd_bdst); ++ a->mvdown.stbr[AUFS_MVDOWN_LOWER].brid = br->br_id; ++ err = au_br_stfs(br, &a->mvdown.stbr[AUFS_MVDOWN_LOWER].stfs); ++ } ++ if (!err) ++ a->mvdown.flags &= ~AUFS_MVDOWN_STFS_FAILED; ++ else ++ AU_MVD_PR(dmsg, "statfs failed (%d), ignored\n", err); ++} ++ ++/* ++ * copy-down the file and unlink the bsrc file. ++ * - unlink the bdst whout if exist ++ * - copy-down the file (with whtmp name and rename) ++ * - unlink the bsrc file ++ */ ++static int au_do_mvdown(const unsigned char dmsg, struct au_mvd_args *a) ++{ ++ int err; ++ ++ err = au_do_mkdir(dmsg, a); ++ if (!err) ++ err = au_do_lock(dmsg, a); ++ if (unlikely(err)) ++ goto out; ++ ++ /* ++ * do not revert the activities we made on bdst since they should be ++ * harmless in aufs. ++ */ ++ ++ err = au_do_cpdown(dmsg, a); ++ if (!err) ++ err = au_do_unlink_wh(dmsg, a); ++ if (!err && !(a->mvdown.flags & AUFS_MVDOWN_KUPPER)) ++ err = au_do_unlink(dmsg, a); ++ if (unlikely(err)) ++ goto out_unlock; ++ ++ AuDbg("%pd2, 0x%x, %d --> %d\n", ++ a->dentry, a->mvdown.flags, a->mvd_bsrc, a->mvd_bdst); ++ if (find_lower_writable(a) < 0) ++ a->mvdown.flags |= AUFS_MVDOWN_BOTTOM; ++ ++ if (a->mvdown.flags & AUFS_MVDOWN_STFS) ++ au_do_stfs(dmsg, a); ++ ++ /* maintain internal array */ ++ if (!(a->mvdown.flags & AUFS_MVDOWN_KUPPER)) { ++ au_set_h_dptr(a->dentry, a->mvd_bsrc, NULL); ++ au_set_dbtop(a->dentry, a->mvd_bdst); ++ au_set_h_iptr(a->inode, a->mvd_bsrc, NULL, /*flags*/0); ++ au_set_ibtop(a->inode, a->mvd_bdst); ++ } else { ++ /* hide the lower */ ++ au_set_h_dptr(a->dentry, a->mvd_bdst, NULL); ++ au_set_dbbot(a->dentry, a->mvd_bsrc); ++ au_set_h_iptr(a->inode, a->mvd_bdst, NULL, /*flags*/0); ++ au_set_ibbot(a->inode, a->mvd_bsrc); ++ } ++ if (au_dbbot(a->dentry) < a->mvd_bdst) ++ au_set_dbbot(a->dentry, a->mvd_bdst); ++ if (au_ibbot(a->inode) < a->mvd_bdst) ++ au_set_ibbot(a->inode, a->mvd_bdst); ++ ++out_unlock: ++ au_do_unlock(dmsg, a); ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* make sure the file is idle */ ++static int au_mvd_args_busy(const unsigned char dmsg, struct au_mvd_args *a) ++{ ++ int err, plinked; ++ ++ err = 0; ++ plinked = !!au_opt_test(au_mntflags(a->sb), PLINK); ++ if (au_dbtop(a->dentry) == a->mvd_bsrc ++ && au_dcount(a->dentry) == 1 ++ && atomic_read(&a->inode->i_count) == 1 ++ /* && a->mvd_h_src_inode->i_nlink == 1 */ ++ && (!plinked || !au_plink_test(a->inode)) ++ && a->inode->i_nlink == 1) ++ goto out; ++ ++ err = -EBUSY; ++ AU_MVD_PR(dmsg, ++ "b%d, d{b%d, c%d?}, i{c%d?, l%u}, hi{l%u}, p{%d, %d}\n", ++ a->mvd_bsrc, au_dbtop(a->dentry), au_dcount(a->dentry), ++ atomic_read(&a->inode->i_count), a->inode->i_nlink, ++ a->mvd_h_src_inode->i_nlink, ++ plinked, plinked ? au_plink_test(a->inode) : 0); ++ ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++/* make sure the parent dir is fine */ ++static int au_mvd_args_parent(const unsigned char dmsg, ++ struct au_mvd_args *a) ++{ ++ int err; ++ aufs_bindex_t bindex; ++ ++ err = 0; ++ if (unlikely(au_alive_dir(a->parent))) { ++ err = -ENOENT; ++ AU_MVD_PR(dmsg, "parent dir is dead\n"); ++ goto out; ++ } ++ ++ a->bopq = au_dbdiropq(a->parent); ++ bindex = au_wbr_nonopq(a->dentry, a->mvd_bdst); ++ AuDbg("b%d\n", bindex); ++ if (unlikely((bindex >= 0 && bindex < a->mvd_bdst) ++ || (a->bopq != -1 && a->bopq < a->mvd_bdst))) { ++ err = -EINVAL; ++ a->mvd_errno = EAU_MVDOWN_OPAQUE; ++ AU_MVD_PR(dmsg, "ancestor is opaque b%d, b%d\n", ++ a->bopq, a->mvd_bdst); ++ } ++ ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++static int au_mvd_args_intermediate(const unsigned char dmsg, ++ struct au_mvd_args *a) ++{ ++ int err; ++ struct au_dinfo *dinfo, *tmp; ++ ++ /* lookup the next lower positive entry */ ++ err = -ENOMEM; ++ tmp = au_di_alloc(a->sb, AuLsc_DI_TMP); ++ if (unlikely(!tmp)) ++ goto out; ++ ++ a->bfound = -1; ++ a->bwh = -1; ++ dinfo = au_di(a->dentry); ++ au_di_cp(tmp, dinfo); ++ au_di_swap(tmp, dinfo); ++ ++ /* returns the number of positive dentries */ ++ err = au_lkup_dentry(a->dentry, a->mvd_bsrc + 1, /*type*/0); ++ if (!err) ++ a->bwh = au_dbwh(a->dentry); ++ else if (err > 0) ++ a->bfound = au_dbtop(a->dentry); ++ ++ au_di_swap(tmp, dinfo); ++ au_rw_write_unlock(&tmp->di_rwsem); ++ au_di_free(tmp); ++ if (unlikely(err < 0)) ++ AU_MVD_PR(dmsg, "failed look-up lower\n"); ++ ++ /* ++ * here, we have these cases. ++ * bfound == -1 ++ * no positive dentry under bsrc. there are more sub-cases. ++ * bwh < 0 ++ * there no whiteout, we can safely move-down. ++ * bwh <= bsrc ++ * impossible ++ * bsrc < bwh && bwh < bdst ++ * there is a whiteout on RO branch. cannot proceed. ++ * bwh == bdst ++ * there is a whiteout on the RW target branch. it should ++ * be removed. ++ * bdst < bwh ++ * there is a whiteout somewhere unrelated branch. ++ * -1 < bfound && bfound <= bsrc ++ * impossible. ++ * bfound < bdst ++ * found, but it is on RO branch between bsrc and bdst. cannot ++ * proceed. ++ * bfound == bdst ++ * found, replace it if AUFS_MVDOWN_FORCE is set. otherwise return ++ * error. ++ * bdst < bfound ++ * found, after we create the file on bdst, it will be hidden. ++ */ ++ ++ AuDebugOn(a->bfound == -1 ++ && a->bwh != -1 ++ && a->bwh <= a->mvd_bsrc); ++ AuDebugOn(-1 < a->bfound ++ && a->bfound <= a->mvd_bsrc); ++ ++ err = -EINVAL; ++ if (a->bfound == -1 ++ && a->mvd_bsrc < a->bwh ++ && a->bwh != -1 ++ && a->bwh < a->mvd_bdst) { ++ a->mvd_errno = EAU_MVDOWN_WHITEOUT; ++ AU_MVD_PR(dmsg, "bsrc %d, bdst %d, bfound %d, bwh %d\n", ++ a->mvd_bsrc, a->mvd_bdst, a->bfound, a->bwh); ++ goto out; ++ } else if (a->bfound != -1 && a->bfound < a->mvd_bdst) { ++ a->mvd_errno = EAU_MVDOWN_UPPER; ++ AU_MVD_PR(dmsg, "bdst %d, bfound %d\n", ++ a->mvd_bdst, a->bfound); ++ goto out; ++ } ++ ++ err = 0; /* success */ ++ ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++static int au_mvd_args_exist(const unsigned char dmsg, struct au_mvd_args *a) ++{ ++ int err; ++ ++ err = 0; ++ if (!(a->mvdown.flags & AUFS_MVDOWN_OWLOWER) ++ && a->bfound == a->mvd_bdst) ++ err = -EEXIST; ++ AuTraceErr(err); ++ return err; ++} ++ ++static int au_mvd_args(const unsigned char dmsg, struct au_mvd_args *a) ++{ ++ int err; ++ struct au_branch *br; ++ ++ err = -EISDIR; ++ if (unlikely(S_ISDIR(a->inode->i_mode))) ++ goto out; ++ ++ err = -EINVAL; ++ if (!(a->mvdown.flags & AUFS_MVDOWN_BRID_UPPER)) ++ a->mvd_bsrc = au_ibtop(a->inode); ++ else { ++ a->mvd_bsrc = au_br_index(a->sb, a->mvd_src_brid); ++ if (unlikely(a->mvd_bsrc < 0 ++ || (a->mvd_bsrc < au_dbtop(a->dentry) ++ || au_dbbot(a->dentry) < a->mvd_bsrc ++ || !au_h_dptr(a->dentry, a->mvd_bsrc)) ++ || (a->mvd_bsrc < au_ibtop(a->inode) ++ || au_ibbot(a->inode) < a->mvd_bsrc ++ || !au_h_iptr(a->inode, a->mvd_bsrc)))) { ++ a->mvd_errno = EAU_MVDOWN_NOUPPER; ++ AU_MVD_PR(dmsg, "no upper\n"); ++ goto out; ++ } ++ } ++ if (unlikely(a->mvd_bsrc == au_sbbot(a->sb))) { ++ a->mvd_errno = EAU_MVDOWN_BOTTOM; ++ AU_MVD_PR(dmsg, "on the bottom\n"); ++ goto out; ++ } ++ a->mvd_h_src_inode = au_h_iptr(a->inode, a->mvd_bsrc); ++ br = au_sbr(a->sb, a->mvd_bsrc); ++ err = au_br_rdonly(br); ++ if (!(a->mvdown.flags & AUFS_MVDOWN_ROUPPER)) { ++ if (unlikely(err)) ++ goto out; ++ } else if (!(vfsub_native_ro(a->mvd_h_src_inode) ++ || IS_APPEND(a->mvd_h_src_inode))) { ++ if (err) ++ a->mvdown.flags |= AUFS_MVDOWN_ROUPPER_R; ++ /* go on */ ++ } else ++ goto out; ++ ++ err = -EINVAL; ++ if (!(a->mvdown.flags & AUFS_MVDOWN_BRID_LOWER)) { ++ a->mvd_bdst = find_lower_writable(a); ++ if (unlikely(a->mvd_bdst < 0)) { ++ a->mvd_errno = EAU_MVDOWN_BOTTOM; ++ AU_MVD_PR(dmsg, "no writable lower branch\n"); ++ goto out; ++ } ++ } else { ++ a->mvd_bdst = au_br_index(a->sb, a->mvd_dst_brid); ++ if (unlikely(a->mvd_bdst < 0 ++ || au_sbbot(a->sb) < a->mvd_bdst)) { ++ a->mvd_errno = EAU_MVDOWN_NOLOWERBR; ++ AU_MVD_PR(dmsg, "no lower brid\n"); ++ goto out; ++ } ++ } ++ ++ err = au_mvd_args_busy(dmsg, a); ++ if (!err) ++ err = au_mvd_args_parent(dmsg, a); ++ if (!err) ++ err = au_mvd_args_intermediate(dmsg, a); ++ if (!err) ++ err = au_mvd_args_exist(dmsg, a); ++ if (!err) ++ AuDbg("b%d, b%d\n", a->mvd_bsrc, a->mvd_bdst); ++ ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++int au_mvdown(struct dentry *dentry, struct aufs_mvdown __user *uarg) ++{ ++ int err, e; ++ unsigned char dmsg; ++ struct au_mvd_args *args; ++ struct inode *inode; ++ ++ inode = d_inode(dentry); ++ err = -EPERM; ++ if (unlikely(!capable(CAP_SYS_ADMIN))) ++ goto out; ++ ++ err = -ENOMEM; ++ args = kmalloc(sizeof(*args), GFP_NOFS); ++ if (unlikely(!args)) ++ goto out; ++ ++ err = copy_from_user(&args->mvdown, uarg, sizeof(args->mvdown)); ++ if (!err) ++ err = !access_ok(VERIFY_WRITE, uarg, sizeof(*uarg)); ++ if (unlikely(err)) { ++ err = -EFAULT; ++ AuTraceErr(err); ++ goto out_free; ++ } ++ AuDbg("flags 0x%x\n", args->mvdown.flags); ++ args->mvdown.flags &= ~(AUFS_MVDOWN_ROLOWER_R | AUFS_MVDOWN_ROUPPER_R); ++ args->mvdown.au_errno = 0; ++ args->dentry = dentry; ++ args->inode = inode; ++ args->sb = dentry->d_sb; ++ ++ err = -ENOENT; ++ dmsg = !!(args->mvdown.flags & AUFS_MVDOWN_DMSG); ++ args->parent = dget_parent(dentry); ++ args->dir = d_inode(args->parent); ++ inode_lock_nested(args->dir, I_MUTEX_PARENT); ++ dput(args->parent); ++ if (unlikely(args->parent != dentry->d_parent)) { ++ AU_MVD_PR(dmsg, "parent dir is moved\n"); ++ goto out_dir; ++ } ++ ++ inode_lock_nested(inode, I_MUTEX_CHILD); ++ err = aufs_read_lock(dentry, AuLock_DW | AuLock_FLUSH | AuLock_NOPLMW); ++ if (unlikely(err)) ++ goto out_inode; ++ ++ di_write_lock_parent(args->parent); ++ err = au_mvd_args(dmsg, args); ++ if (unlikely(err)) ++ goto out_parent; ++ ++ err = au_do_mvdown(dmsg, args); ++ if (unlikely(err)) ++ goto out_parent; ++ ++ au_cpup_attr_timesizes(args->dir); ++ au_cpup_attr_timesizes(inode); ++ if (!(args->mvdown.flags & AUFS_MVDOWN_KUPPER)) ++ au_cpup_igen(inode, au_h_iptr(inode, args->mvd_bdst)); ++ /* au_digen_dec(dentry); */ ++ ++out_parent: ++ di_write_unlock(args->parent); ++ aufs_read_unlock(dentry, AuLock_DW); ++out_inode: ++ inode_unlock(inode); ++out_dir: ++ inode_unlock(args->dir); ++out_free: ++ e = copy_to_user(uarg, &args->mvdown, sizeof(args->mvdown)); ++ if (unlikely(e)) ++ err = -EFAULT; ++ kfree(args); ++out: ++ AuTraceErr(err); ++ return err; ++} +diff --git a/fs/aufs/opts.c b/fs/aufs/opts.c +new file mode 100644 +index 0000000..6d53b2e +--- /dev/null ++++ b/fs/aufs/opts.c +@@ -0,0 +1,1859 @@ ++/* ++ * Copyright (C) 2005-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * mount options/flags ++ */ ++ ++#include ++#include /* a distribution requires */ ++#include ++#include "aufs.h" ++ ++/* ---------------------------------------------------------------------- */ ++ ++enum { ++ Opt_br, ++ Opt_add, Opt_del, Opt_mod, Opt_append, Opt_prepend, ++ Opt_idel, Opt_imod, ++ Opt_dirwh, Opt_rdcache, Opt_rdblk, Opt_rdhash, ++ Opt_rdblk_def, Opt_rdhash_def, ++ Opt_xino, Opt_noxino, ++ Opt_trunc_xino, Opt_trunc_xino_v, Opt_notrunc_xino, ++ Opt_trunc_xino_path, Opt_itrunc_xino, ++ Opt_trunc_xib, Opt_notrunc_xib, ++ Opt_shwh, Opt_noshwh, ++ Opt_plink, Opt_noplink, Opt_list_plink, ++ Opt_udba, ++ Opt_dio, Opt_nodio, ++ Opt_diropq_a, Opt_diropq_w, ++ Opt_warn_perm, Opt_nowarn_perm, ++ Opt_wbr_copyup, Opt_wbr_create, ++ Opt_fhsm_sec, ++ Opt_verbose, Opt_noverbose, ++ Opt_sum, Opt_nosum, Opt_wsum, ++ Opt_dirperm1, Opt_nodirperm1, ++ Opt_acl, Opt_noacl, ++ Opt_tail, Opt_ignore, Opt_ignore_silent, Opt_err ++}; ++ ++static match_table_t options = { ++ {Opt_br, "br=%s"}, ++ {Opt_br, "br:%s"}, ++ ++ {Opt_add, "add=%d:%s"}, ++ {Opt_add, "add:%d:%s"}, ++ {Opt_add, "ins=%d:%s"}, ++ {Opt_add, "ins:%d:%s"}, ++ {Opt_append, "append=%s"}, ++ {Opt_append, "append:%s"}, ++ {Opt_prepend, "prepend=%s"}, ++ {Opt_prepend, "prepend:%s"}, ++ ++ {Opt_del, "del=%s"}, ++ {Opt_del, "del:%s"}, ++ /* {Opt_idel, "idel:%d"}, */ ++ {Opt_mod, "mod=%s"}, ++ {Opt_mod, "mod:%s"}, ++ /* {Opt_imod, "imod:%d:%s"}, */ ++ ++ {Opt_dirwh, "dirwh=%d"}, ++ ++ {Opt_xino, "xino=%s"}, ++ {Opt_noxino, "noxino"}, ++ {Opt_trunc_xino, "trunc_xino"}, ++ {Opt_trunc_xino_v, "trunc_xino_v=%d:%d"}, ++ {Opt_notrunc_xino, "notrunc_xino"}, ++ {Opt_trunc_xino_path, "trunc_xino=%s"}, ++ {Opt_itrunc_xino, "itrunc_xino=%d"}, ++ /* {Opt_zxino, "zxino=%s"}, */ ++ {Opt_trunc_xib, "trunc_xib"}, ++ {Opt_notrunc_xib, "notrunc_xib"}, ++ ++#ifdef CONFIG_PROC_FS ++ {Opt_plink, "plink"}, ++#else ++ {Opt_ignore_silent, "plink"}, ++#endif ++ ++ {Opt_noplink, "noplink"}, ++ ++#ifdef CONFIG_AUFS_DEBUG ++ {Opt_list_plink, "list_plink"}, ++#endif ++ ++ {Opt_udba, "udba=%s"}, ++ ++ {Opt_dio, "dio"}, ++ {Opt_nodio, "nodio"}, ++ ++#ifdef CONFIG_AUFS_FHSM ++ {Opt_fhsm_sec, "fhsm_sec=%d"}, ++#else ++ {Opt_ignore_silent, "fhsm_sec=%d"}, ++#endif ++ ++ {Opt_diropq_a, "diropq=always"}, ++ {Opt_diropq_a, "diropq=a"}, ++ {Opt_diropq_w, "diropq=whiteouted"}, ++ {Opt_diropq_w, "diropq=w"}, ++ ++ {Opt_warn_perm, "warn_perm"}, ++ {Opt_nowarn_perm, "nowarn_perm"}, ++ ++ /* keep them temporary */ ++ {Opt_ignore_silent, "nodlgt"}, ++ {Opt_ignore_silent, "clean_plink"}, ++ ++#ifdef CONFIG_AUFS_SHWH ++ {Opt_shwh, "shwh"}, ++#endif ++ {Opt_noshwh, "noshwh"}, ++ ++ {Opt_dirperm1, "dirperm1"}, ++ {Opt_nodirperm1, "nodirperm1"}, ++ ++ {Opt_verbose, "verbose"}, ++ {Opt_verbose, "v"}, ++ {Opt_noverbose, "noverbose"}, ++ {Opt_noverbose, "quiet"}, ++ {Opt_noverbose, "q"}, ++ {Opt_noverbose, "silent"}, ++ ++ {Opt_sum, "sum"}, ++ {Opt_nosum, "nosum"}, ++ {Opt_wsum, "wsum"}, ++ ++ {Opt_rdcache, "rdcache=%d"}, ++ {Opt_rdblk, "rdblk=%d"}, ++ {Opt_rdblk_def, "rdblk=def"}, ++ {Opt_rdhash, "rdhash=%d"}, ++ {Opt_rdhash_def, "rdhash=def"}, ++ ++ {Opt_wbr_create, "create=%s"}, ++ {Opt_wbr_create, "create_policy=%s"}, ++ {Opt_wbr_copyup, "cpup=%s"}, ++ {Opt_wbr_copyup, "copyup=%s"}, ++ {Opt_wbr_copyup, "copyup_policy=%s"}, ++ ++ /* generic VFS flag */ ++#ifdef CONFIG_FS_POSIX_ACL ++ {Opt_acl, "acl"}, ++ {Opt_noacl, "noacl"}, ++#else ++ {Opt_ignore_silent, "acl"}, ++ {Opt_ignore_silent, "noacl"}, ++#endif ++ ++ /* internal use for the scripts */ ++ {Opt_ignore_silent, "si=%s"}, ++ ++ {Opt_br, "dirs=%s"}, ++ {Opt_ignore, "debug=%d"}, ++ {Opt_ignore, "delete=whiteout"}, ++ {Opt_ignore, "delete=all"}, ++ {Opt_ignore, "imap=%s"}, ++ ++ /* temporary workaround, due to old mount(8)? */ ++ {Opt_ignore_silent, "relatime"}, ++ ++ {Opt_err, NULL} ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++static const char *au_parser_pattern(int val, match_table_t tbl) ++{ ++ struct match_token *p; ++ ++ p = tbl; ++ while (p->pattern) { ++ if (p->token == val) ++ return p->pattern; ++ p++; ++ } ++ BUG(); ++ return "??"; ++} ++ ++static const char *au_optstr(int *val, match_table_t tbl) ++{ ++ struct match_token *p; ++ int v; ++ ++ v = *val; ++ if (!v) ++ goto out; ++ p = tbl; ++ while (p->pattern) { ++ if (p->token ++ && (v & p->token) == p->token) { ++ *val &= ~p->token; ++ return p->pattern; ++ } ++ p++; ++ } ++ ++out: ++ return NULL; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static match_table_t brperm = { ++ {AuBrPerm_RO, AUFS_BRPERM_RO}, ++ {AuBrPerm_RR, AUFS_BRPERM_RR}, ++ {AuBrPerm_RW, AUFS_BRPERM_RW}, ++ {0, NULL} ++}; ++ ++static match_table_t brattr = { ++ /* general */ ++ {AuBrAttr_COO_REG, AUFS_BRATTR_COO_REG}, ++ {AuBrAttr_COO_ALL, AUFS_BRATTR_COO_ALL}, ++ /* 'unpin' attrib is meaningless since linux-3.18-rc1 */ ++ {AuBrAttr_UNPIN, AUFS_BRATTR_UNPIN}, ++#ifdef CONFIG_AUFS_FHSM ++ {AuBrAttr_FHSM, AUFS_BRATTR_FHSM}, ++#endif ++#ifdef CONFIG_AUFS_XATTR ++ {AuBrAttr_ICEX, AUFS_BRATTR_ICEX}, ++ {AuBrAttr_ICEX_SEC, AUFS_BRATTR_ICEX_SEC}, ++ {AuBrAttr_ICEX_SYS, AUFS_BRATTR_ICEX_SYS}, ++ {AuBrAttr_ICEX_TR, AUFS_BRATTR_ICEX_TR}, ++ {AuBrAttr_ICEX_USR, AUFS_BRATTR_ICEX_USR}, ++ {AuBrAttr_ICEX_OTH, AUFS_BRATTR_ICEX_OTH}, ++#endif ++ ++ /* ro/rr branch */ ++ {AuBrRAttr_WH, AUFS_BRRATTR_WH}, ++ ++ /* rw branch */ ++ {AuBrWAttr_MOO, AUFS_BRWATTR_MOO}, ++ {AuBrWAttr_NoLinkWH, AUFS_BRWATTR_NLWH}, ++ ++ {0, NULL} ++}; ++ ++static int br_attr_val(char *str, match_table_t table, substring_t args[]) ++{ ++ int attr, v; ++ char *p; ++ ++ attr = 0; ++ do { ++ p = strchr(str, '+'); ++ if (p) ++ *p = 0; ++ v = match_token(str, table, args); ++ if (v) { ++ if (v & AuBrAttr_CMOO_Mask) ++ attr &= ~AuBrAttr_CMOO_Mask; ++ attr |= v; ++ } else { ++ if (p) ++ *p = '+'; ++ pr_warn("ignored branch attribute %s\n", str); ++ break; ++ } ++ if (p) ++ str = p + 1; ++ } while (p); ++ ++ return attr; ++} ++ ++static int au_do_optstr_br_attr(au_br_perm_str_t *str, int perm) ++{ ++ int sz; ++ const char *p; ++ char *q; ++ ++ q = str->a; ++ *q = 0; ++ p = au_optstr(&perm, brattr); ++ if (p) { ++ sz = strlen(p); ++ memcpy(q, p, sz + 1); ++ q += sz; ++ } else ++ goto out; ++ ++ do { ++ p = au_optstr(&perm, brattr); ++ if (p) { ++ *q++ = '+'; ++ sz = strlen(p); ++ memcpy(q, p, sz + 1); ++ q += sz; ++ } ++ } while (p); ++ ++out: ++ return q - str->a; ++} ++ ++static int noinline_for_stack br_perm_val(char *perm) ++{ ++ int val, bad, sz; ++ char *p; ++ substring_t args[MAX_OPT_ARGS]; ++ au_br_perm_str_t attr; ++ ++ p = strchr(perm, '+'); ++ if (p) ++ *p = 0; ++ val = match_token(perm, brperm, args); ++ if (!val) { ++ if (p) ++ *p = '+'; ++ pr_warn("ignored branch permission %s\n", perm); ++ val = AuBrPerm_RO; ++ goto out; ++ } ++ if (!p) ++ goto out; ++ ++ val |= br_attr_val(p + 1, brattr, args); ++ ++ bad = 0; ++ switch (val & AuBrPerm_Mask) { ++ case AuBrPerm_RO: ++ case AuBrPerm_RR: ++ bad = val & AuBrWAttr_Mask; ++ val &= ~AuBrWAttr_Mask; ++ break; ++ case AuBrPerm_RW: ++ bad = val & AuBrRAttr_Mask; ++ val &= ~AuBrRAttr_Mask; ++ break; ++ } ++ ++ /* ++ * 'unpin' attrib becomes meaningless since linux-3.18-rc1, but aufs ++ * does not treat it as an error, just warning. ++ * this is a tiny guard for the user operation. ++ */ ++ if (val & AuBrAttr_UNPIN) { ++ bad |= AuBrAttr_UNPIN; ++ val &= ~AuBrAttr_UNPIN; ++ } ++ ++ if (unlikely(bad)) { ++ sz = au_do_optstr_br_attr(&attr, bad); ++ AuDebugOn(!sz); ++ pr_warn("ignored branch attribute %s\n", attr.a); ++ } ++ ++out: ++ return val; ++} ++ ++void au_optstr_br_perm(au_br_perm_str_t *str, int perm) ++{ ++ au_br_perm_str_t attr; ++ const char *p; ++ char *q; ++ int sz; ++ ++ q = str->a; ++ p = au_optstr(&perm, brperm); ++ AuDebugOn(!p || !*p); ++ sz = strlen(p); ++ memcpy(q, p, sz + 1); ++ q += sz; ++ ++ sz = au_do_optstr_br_attr(&attr, perm); ++ if (sz) { ++ *q++ = '+'; ++ memcpy(q, attr.a, sz + 1); ++ } ++ ++ AuDebugOn(strlen(str->a) >= sizeof(str->a)); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static match_table_t udbalevel = { ++ {AuOpt_UDBA_REVAL, "reval"}, ++ {AuOpt_UDBA_NONE, "none"}, ++#ifdef CONFIG_AUFS_HNOTIFY ++ {AuOpt_UDBA_HNOTIFY, "notify"}, /* abstraction */ ++#ifdef CONFIG_AUFS_HFSNOTIFY ++ {AuOpt_UDBA_HNOTIFY, "fsnotify"}, ++#endif ++#endif ++ {-1, NULL} ++}; ++ ++static int noinline_for_stack udba_val(char *str) ++{ ++ substring_t args[MAX_OPT_ARGS]; ++ ++ return match_token(str, udbalevel, args); ++} ++ ++const char *au_optstr_udba(int udba) ++{ ++ return au_parser_pattern(udba, udbalevel); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static match_table_t au_wbr_create_policy = { ++ {AuWbrCreate_TDP, "tdp"}, ++ {AuWbrCreate_TDP, "top-down-parent"}, ++ {AuWbrCreate_RR, "rr"}, ++ {AuWbrCreate_RR, "round-robin"}, ++ {AuWbrCreate_MFS, "mfs"}, ++ {AuWbrCreate_MFS, "most-free-space"}, ++ {AuWbrCreate_MFSV, "mfs:%d"}, ++ {AuWbrCreate_MFSV, "most-free-space:%d"}, ++ ++ {AuWbrCreate_MFSRR, "mfsrr:%d"}, ++ {AuWbrCreate_MFSRRV, "mfsrr:%d:%d"}, ++ {AuWbrCreate_PMFS, "pmfs"}, ++ {AuWbrCreate_PMFSV, "pmfs:%d"}, ++ {AuWbrCreate_PMFSRR, "pmfsrr:%d"}, ++ {AuWbrCreate_PMFSRRV, "pmfsrr:%d:%d"}, ++ ++ {-1, NULL} ++}; ++ ++/* ++ * cf. linux/lib/parser.c and cmdline.c ++ * gave up calling memparse() since it uses simple_strtoull() instead of ++ * kstrto...(). ++ */ ++static int noinline_for_stack ++au_match_ull(substring_t *s, unsigned long long *result) ++{ ++ int err; ++ unsigned int len; ++ char a[32]; ++ ++ err = -ERANGE; ++ len = s->to - s->from; ++ if (len + 1 <= sizeof(a)) { ++ memcpy(a, s->from, len); ++ a[len] = '\0'; ++ err = kstrtoull(a, 0, result); ++ } ++ return err; ++} ++ ++static int au_wbr_mfs_wmark(substring_t *arg, char *str, ++ struct au_opt_wbr_create *create) ++{ ++ int err; ++ unsigned long long ull; ++ ++ err = 0; ++ if (!au_match_ull(arg, &ull)) ++ create->mfsrr_watermark = ull; ++ else { ++ pr_err("bad integer in %s\n", str); ++ err = -EINVAL; ++ } ++ ++ return err; ++} ++ ++static int au_wbr_mfs_sec(substring_t *arg, char *str, ++ struct au_opt_wbr_create *create) ++{ ++ int n, err; ++ ++ err = 0; ++ if (!match_int(arg, &n) && 0 <= n && n <= AUFS_MFS_MAX_SEC) ++ create->mfs_second = n; ++ else { ++ pr_err("bad integer in %s\n", str); ++ err = -EINVAL; ++ } ++ ++ return err; ++} ++ ++static int noinline_for_stack ++au_wbr_create_val(char *str, struct au_opt_wbr_create *create) ++{ ++ int err, e; ++ substring_t args[MAX_OPT_ARGS]; ++ ++ err = match_token(str, au_wbr_create_policy, args); ++ create->wbr_create = err; ++ switch (err) { ++ case AuWbrCreate_MFSRRV: ++ case AuWbrCreate_PMFSRRV: ++ e = au_wbr_mfs_wmark(&args[0], str, create); ++ if (!e) ++ e = au_wbr_mfs_sec(&args[1], str, create); ++ if (unlikely(e)) ++ err = e; ++ break; ++ case AuWbrCreate_MFSRR: ++ case AuWbrCreate_PMFSRR: ++ e = au_wbr_mfs_wmark(&args[0], str, create); ++ if (unlikely(e)) { ++ err = e; ++ break; ++ } ++ /*FALLTHROUGH*/ ++ case AuWbrCreate_MFS: ++ case AuWbrCreate_PMFS: ++ create->mfs_second = AUFS_MFS_DEF_SEC; ++ break; ++ case AuWbrCreate_MFSV: ++ case AuWbrCreate_PMFSV: ++ e = au_wbr_mfs_sec(&args[0], str, create); ++ if (unlikely(e)) ++ err = e; ++ break; ++ } ++ ++ return err; ++} ++ ++const char *au_optstr_wbr_create(int wbr_create) ++{ ++ return au_parser_pattern(wbr_create, au_wbr_create_policy); ++} ++ ++static match_table_t au_wbr_copyup_policy = { ++ {AuWbrCopyup_TDP, "tdp"}, ++ {AuWbrCopyup_TDP, "top-down-parent"}, ++ {AuWbrCopyup_BUP, "bup"}, ++ {AuWbrCopyup_BUP, "bottom-up-parent"}, ++ {AuWbrCopyup_BU, "bu"}, ++ {AuWbrCopyup_BU, "bottom-up"}, ++ {-1, NULL} ++}; ++ ++static int noinline_for_stack au_wbr_copyup_val(char *str) ++{ ++ substring_t args[MAX_OPT_ARGS]; ++ ++ return match_token(str, au_wbr_copyup_policy, args); ++} ++ ++const char *au_optstr_wbr_copyup(int wbr_copyup) ++{ ++ return au_parser_pattern(wbr_copyup, au_wbr_copyup_policy); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static const int lkup_dirflags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY; ++ ++static void dump_opts(struct au_opts *opts) ++{ ++#ifdef CONFIG_AUFS_DEBUG ++ /* reduce stack space */ ++ union { ++ struct au_opt_add *add; ++ struct au_opt_del *del; ++ struct au_opt_mod *mod; ++ struct au_opt_xino *xino; ++ struct au_opt_xino_itrunc *xino_itrunc; ++ struct au_opt_wbr_create *create; ++ } u; ++ struct au_opt *opt; ++ ++ opt = opts->opt; ++ while (opt->type != Opt_tail) { ++ switch (opt->type) { ++ case Opt_add: ++ u.add = &opt->add; ++ AuDbg("add {b%d, %s, 0x%x, %p}\n", ++ u.add->bindex, u.add->pathname, u.add->perm, ++ u.add->path.dentry); ++ break; ++ case Opt_del: ++ case Opt_idel: ++ u.del = &opt->del; ++ AuDbg("del {%s, %p}\n", ++ u.del->pathname, u.del->h_path.dentry); ++ break; ++ case Opt_mod: ++ case Opt_imod: ++ u.mod = &opt->mod; ++ AuDbg("mod {%s, 0x%x, %p}\n", ++ u.mod->path, u.mod->perm, u.mod->h_root); ++ break; ++ case Opt_append: ++ u.add = &opt->add; ++ AuDbg("append {b%d, %s, 0x%x, %p}\n", ++ u.add->bindex, u.add->pathname, u.add->perm, ++ u.add->path.dentry); ++ break; ++ case Opt_prepend: ++ u.add = &opt->add; ++ AuDbg("prepend {b%d, %s, 0x%x, %p}\n", ++ u.add->bindex, u.add->pathname, u.add->perm, ++ u.add->path.dentry); ++ break; ++ case Opt_dirwh: ++ AuDbg("dirwh %d\n", opt->dirwh); ++ break; ++ case Opt_rdcache: ++ AuDbg("rdcache %d\n", opt->rdcache); ++ break; ++ case Opt_rdblk: ++ AuDbg("rdblk %u\n", opt->rdblk); ++ break; ++ case Opt_rdblk_def: ++ AuDbg("rdblk_def\n"); ++ break; ++ case Opt_rdhash: ++ AuDbg("rdhash %u\n", opt->rdhash); ++ break; ++ case Opt_rdhash_def: ++ AuDbg("rdhash_def\n"); ++ break; ++ case Opt_xino: ++ u.xino = &opt->xino; ++ AuDbg("xino {%s %pD}\n", u.xino->path, u.xino->file); ++ break; ++ case Opt_trunc_xino: ++ AuLabel(trunc_xino); ++ break; ++ case Opt_notrunc_xino: ++ AuLabel(notrunc_xino); ++ break; ++ case Opt_trunc_xino_path: ++ case Opt_itrunc_xino: ++ u.xino_itrunc = &opt->xino_itrunc; ++ AuDbg("trunc_xino %d\n", u.xino_itrunc->bindex); ++ break; ++ case Opt_noxino: ++ AuLabel(noxino); ++ break; ++ case Opt_trunc_xib: ++ AuLabel(trunc_xib); ++ break; ++ case Opt_notrunc_xib: ++ AuLabel(notrunc_xib); ++ break; ++ case Opt_shwh: ++ AuLabel(shwh); ++ break; ++ case Opt_noshwh: ++ AuLabel(noshwh); ++ break; ++ case Opt_dirperm1: ++ AuLabel(dirperm1); ++ break; ++ case Opt_nodirperm1: ++ AuLabel(nodirperm1); ++ break; ++ case Opt_plink: ++ AuLabel(plink); ++ break; ++ case Opt_noplink: ++ AuLabel(noplink); ++ break; ++ case Opt_list_plink: ++ AuLabel(list_plink); ++ break; ++ case Opt_udba: ++ AuDbg("udba %d, %s\n", ++ opt->udba, au_optstr_udba(opt->udba)); ++ break; ++ case Opt_dio: ++ AuLabel(dio); ++ break; ++ case Opt_nodio: ++ AuLabel(nodio); ++ break; ++ case Opt_diropq_a: ++ AuLabel(diropq_a); ++ break; ++ case Opt_diropq_w: ++ AuLabel(diropq_w); ++ break; ++ case Opt_warn_perm: ++ AuLabel(warn_perm); ++ break; ++ case Opt_nowarn_perm: ++ AuLabel(nowarn_perm); ++ break; ++ case Opt_verbose: ++ AuLabel(verbose); ++ break; ++ case Opt_noverbose: ++ AuLabel(noverbose); ++ break; ++ case Opt_sum: ++ AuLabel(sum); ++ break; ++ case Opt_nosum: ++ AuLabel(nosum); ++ break; ++ case Opt_wsum: ++ AuLabel(wsum); ++ break; ++ case Opt_wbr_create: ++ u.create = &opt->wbr_create; ++ AuDbg("create %d, %s\n", u.create->wbr_create, ++ au_optstr_wbr_create(u.create->wbr_create)); ++ switch (u.create->wbr_create) { ++ case AuWbrCreate_MFSV: ++ case AuWbrCreate_PMFSV: ++ AuDbg("%d sec\n", u.create->mfs_second); ++ break; ++ case AuWbrCreate_MFSRR: ++ AuDbg("%llu watermark\n", ++ u.create->mfsrr_watermark); ++ break; ++ case AuWbrCreate_MFSRRV: ++ case AuWbrCreate_PMFSRRV: ++ AuDbg("%llu watermark, %d sec\n", ++ u.create->mfsrr_watermark, ++ u.create->mfs_second); ++ break; ++ } ++ break; ++ case Opt_wbr_copyup: ++ AuDbg("copyup %d, %s\n", opt->wbr_copyup, ++ au_optstr_wbr_copyup(opt->wbr_copyup)); ++ break; ++ case Opt_fhsm_sec: ++ AuDbg("fhsm_sec %u\n", opt->fhsm_second); ++ break; ++ case Opt_acl: ++ AuLabel(acl); ++ break; ++ case Opt_noacl: ++ AuLabel(noacl); ++ break; ++ default: ++ BUG(); ++ } ++ opt++; ++ } ++#endif ++} ++ ++void au_opts_free(struct au_opts *opts) ++{ ++ struct au_opt *opt; ++ ++ opt = opts->opt; ++ while (opt->type != Opt_tail) { ++ switch (opt->type) { ++ case Opt_add: ++ case Opt_append: ++ case Opt_prepend: ++ path_put(&opt->add.path); ++ break; ++ case Opt_del: ++ case Opt_idel: ++ path_put(&opt->del.h_path); ++ break; ++ case Opt_mod: ++ case Opt_imod: ++ dput(opt->mod.h_root); ++ break; ++ case Opt_xino: ++ fput(opt->xino.file); ++ break; ++ } ++ opt++; ++ } ++} ++ ++static int opt_add(struct au_opt *opt, char *opt_str, unsigned long sb_flags, ++ aufs_bindex_t bindex) ++{ ++ int err; ++ struct au_opt_add *add = &opt->add; ++ char *p; ++ ++ add->bindex = bindex; ++ add->perm = AuBrPerm_RO; ++ add->pathname = opt_str; ++ p = strchr(opt_str, '='); ++ if (p) { ++ *p++ = 0; ++ if (*p) ++ add->perm = br_perm_val(p); ++ } ++ ++ err = vfsub_kern_path(add->pathname, lkup_dirflags, &add->path); ++ if (!err) { ++ if (!p) { ++ add->perm = AuBrPerm_RO; ++ if (au_test_fs_rr(add->path.dentry->d_sb)) ++ add->perm = AuBrPerm_RR; ++ else if (!bindex && !(sb_flags & MS_RDONLY)) ++ add->perm = AuBrPerm_RW; ++ } ++ opt->type = Opt_add; ++ goto out; ++ } ++ pr_err("lookup failed %s (%d)\n", add->pathname, err); ++ err = -EINVAL; ++ ++out: ++ return err; ++} ++ ++static int au_opts_parse_del(struct au_opt_del *del, substring_t args[]) ++{ ++ int err; ++ ++ del->pathname = args[0].from; ++ AuDbg("del path %s\n", del->pathname); ++ ++ err = vfsub_kern_path(del->pathname, lkup_dirflags, &del->h_path); ++ if (unlikely(err)) ++ pr_err("lookup failed %s (%d)\n", del->pathname, err); ++ ++ return err; ++} ++ ++#if 0 /* reserved for future use */ ++static int au_opts_parse_idel(struct super_block *sb, aufs_bindex_t bindex, ++ struct au_opt_del *del, substring_t args[]) ++{ ++ int err; ++ struct dentry *root; ++ ++ err = -EINVAL; ++ root = sb->s_root; ++ aufs_read_lock(root, AuLock_FLUSH); ++ if (bindex < 0 || au_sbbot(sb) < bindex) { ++ pr_err("out of bounds, %d\n", bindex); ++ goto out; ++ } ++ ++ err = 0; ++ del->h_path.dentry = dget(au_h_dptr(root, bindex)); ++ del->h_path.mnt = mntget(au_sbr_mnt(sb, bindex)); ++ ++out: ++ aufs_read_unlock(root, !AuLock_IR); ++ return err; ++} ++#endif ++ ++static int noinline_for_stack ++au_opts_parse_mod(struct au_opt_mod *mod, substring_t args[]) ++{ ++ int err; ++ struct path path; ++ char *p; ++ ++ err = -EINVAL; ++ mod->path = args[0].from; ++ p = strchr(mod->path, '='); ++ if (unlikely(!p)) { ++ pr_err("no permssion %s\n", args[0].from); ++ goto out; ++ } ++ ++ *p++ = 0; ++ err = vfsub_kern_path(mod->path, lkup_dirflags, &path); ++ if (unlikely(err)) { ++ pr_err("lookup failed %s (%d)\n", mod->path, err); ++ goto out; ++ } ++ ++ mod->perm = br_perm_val(p); ++ AuDbg("mod path %s, perm 0x%x, %s\n", mod->path, mod->perm, p); ++ mod->h_root = dget(path.dentry); ++ path_put(&path); ++ ++out: ++ return err; ++} ++ ++#if 0 /* reserved for future use */ ++static int au_opts_parse_imod(struct super_block *sb, aufs_bindex_t bindex, ++ struct au_opt_mod *mod, substring_t args[]) ++{ ++ int err; ++ struct dentry *root; ++ ++ err = -EINVAL; ++ root = sb->s_root; ++ aufs_read_lock(root, AuLock_FLUSH); ++ if (bindex < 0 || au_sbbot(sb) < bindex) { ++ pr_err("out of bounds, %d\n", bindex); ++ goto out; ++ } ++ ++ err = 0; ++ mod->perm = br_perm_val(args[1].from); ++ AuDbg("mod path %s, perm 0x%x, %s\n", ++ mod->path, mod->perm, args[1].from); ++ mod->h_root = dget(au_h_dptr(root, bindex)); ++ ++out: ++ aufs_read_unlock(root, !AuLock_IR); ++ return err; ++} ++#endif ++ ++static int au_opts_parse_xino(struct super_block *sb, struct au_opt_xino *xino, ++ substring_t args[]) ++{ ++ int err; ++ struct file *file; ++ ++ file = au_xino_create(sb, args[0].from, /*silent*/0); ++ err = PTR_ERR(file); ++ if (IS_ERR(file)) ++ goto out; ++ ++ err = -EINVAL; ++ if (unlikely(file->f_path.dentry->d_sb == sb)) { ++ fput(file); ++ pr_err("%s must be outside\n", args[0].from); ++ goto out; ++ } ++ ++ err = 0; ++ xino->file = file; ++ xino->path = args[0].from; ++ ++out: ++ return err; ++} ++ ++static int noinline_for_stack ++au_opts_parse_xino_itrunc_path(struct super_block *sb, ++ struct au_opt_xino_itrunc *xino_itrunc, ++ substring_t args[]) ++{ ++ int err; ++ aufs_bindex_t bbot, bindex; ++ struct path path; ++ struct dentry *root; ++ ++ err = vfsub_kern_path(args[0].from, lkup_dirflags, &path); ++ if (unlikely(err)) { ++ pr_err("lookup failed %s (%d)\n", args[0].from, err); ++ goto out; ++ } ++ ++ xino_itrunc->bindex = -1; ++ root = sb->s_root; ++ aufs_read_lock(root, AuLock_FLUSH); ++ bbot = au_sbbot(sb); ++ for (bindex = 0; bindex <= bbot; bindex++) { ++ if (au_h_dptr(root, bindex) == path.dentry) { ++ xino_itrunc->bindex = bindex; ++ break; ++ } ++ } ++ aufs_read_unlock(root, !AuLock_IR); ++ path_put(&path); ++ ++ if (unlikely(xino_itrunc->bindex < 0)) { ++ pr_err("no such branch %s\n", args[0].from); ++ err = -EINVAL; ++ } ++ ++out: ++ return err; ++} ++ ++/* called without aufs lock */ ++int au_opts_parse(struct super_block *sb, char *str, struct au_opts *opts) ++{ ++ int err, n, token; ++ aufs_bindex_t bindex; ++ unsigned char skipped; ++ struct dentry *root; ++ struct au_opt *opt, *opt_tail; ++ char *opt_str; ++ /* reduce the stack space */ ++ union { ++ struct au_opt_xino_itrunc *xino_itrunc; ++ struct au_opt_wbr_create *create; ++ } u; ++ struct { ++ substring_t args[MAX_OPT_ARGS]; ++ } *a; ++ ++ err = -ENOMEM; ++ a = kmalloc(sizeof(*a), GFP_NOFS); ++ if (unlikely(!a)) ++ goto out; ++ ++ root = sb->s_root; ++ err = 0; ++ bindex = 0; ++ opt = opts->opt; ++ opt_tail = opt + opts->max_opt - 1; ++ opt->type = Opt_tail; ++ while (!err && (opt_str = strsep(&str, ",")) && *opt_str) { ++ err = -EINVAL; ++ skipped = 0; ++ token = match_token(opt_str, options, a->args); ++ switch (token) { ++ case Opt_br: ++ err = 0; ++ while (!err && (opt_str = strsep(&a->args[0].from, ":")) ++ && *opt_str) { ++ err = opt_add(opt, opt_str, opts->sb_flags, ++ bindex++); ++ if (unlikely(!err && ++opt > opt_tail)) { ++ err = -E2BIG; ++ break; ++ } ++ opt->type = Opt_tail; ++ skipped = 1; ++ } ++ break; ++ case Opt_add: ++ if (unlikely(match_int(&a->args[0], &n))) { ++ pr_err("bad integer in %s\n", opt_str); ++ break; ++ } ++ bindex = n; ++ err = opt_add(opt, a->args[1].from, opts->sb_flags, ++ bindex); ++ if (!err) ++ opt->type = token; ++ break; ++ case Opt_append: ++ err = opt_add(opt, a->args[0].from, opts->sb_flags, ++ /*dummy bindex*/1); ++ if (!err) ++ opt->type = token; ++ break; ++ case Opt_prepend: ++ err = opt_add(opt, a->args[0].from, opts->sb_flags, ++ /*bindex*/0); ++ if (!err) ++ opt->type = token; ++ break; ++ case Opt_del: ++ err = au_opts_parse_del(&opt->del, a->args); ++ if (!err) ++ opt->type = token; ++ break; ++#if 0 /* reserved for future use */ ++ case Opt_idel: ++ del->pathname = "(indexed)"; ++ if (unlikely(match_int(&args[0], &n))) { ++ pr_err("bad integer in %s\n", opt_str); ++ break; ++ } ++ err = au_opts_parse_idel(sb, n, &opt->del, a->args); ++ if (!err) ++ opt->type = token; ++ break; ++#endif ++ case Opt_mod: ++ err = au_opts_parse_mod(&opt->mod, a->args); ++ if (!err) ++ opt->type = token; ++ break; ++#ifdef IMOD /* reserved for future use */ ++ case Opt_imod: ++ u.mod->path = "(indexed)"; ++ if (unlikely(match_int(&a->args[0], &n))) { ++ pr_err("bad integer in %s\n", opt_str); ++ break; ++ } ++ err = au_opts_parse_imod(sb, n, &opt->mod, a->args); ++ if (!err) ++ opt->type = token; ++ break; ++#endif ++ case Opt_xino: ++ err = au_opts_parse_xino(sb, &opt->xino, a->args); ++ if (!err) ++ opt->type = token; ++ break; ++ ++ case Opt_trunc_xino_path: ++ err = au_opts_parse_xino_itrunc_path ++ (sb, &opt->xino_itrunc, a->args); ++ if (!err) ++ opt->type = token; ++ break; ++ ++ case Opt_itrunc_xino: ++ u.xino_itrunc = &opt->xino_itrunc; ++ if (unlikely(match_int(&a->args[0], &n))) { ++ pr_err("bad integer in %s\n", opt_str); ++ break; ++ } ++ u.xino_itrunc->bindex = n; ++ aufs_read_lock(root, AuLock_FLUSH); ++ if (n < 0 || au_sbbot(sb) < n) { ++ pr_err("out of bounds, %d\n", n); ++ aufs_read_unlock(root, !AuLock_IR); ++ break; ++ } ++ aufs_read_unlock(root, !AuLock_IR); ++ err = 0; ++ opt->type = token; ++ break; ++ ++ case Opt_dirwh: ++ if (unlikely(match_int(&a->args[0], &opt->dirwh))) ++ break; ++ err = 0; ++ opt->type = token; ++ break; ++ ++ case Opt_rdcache: ++ if (unlikely(match_int(&a->args[0], &n))) { ++ pr_err("bad integer in %s\n", opt_str); ++ break; ++ } ++ if (unlikely(n > AUFS_RDCACHE_MAX)) { ++ pr_err("rdcache must be smaller than %d\n", ++ AUFS_RDCACHE_MAX); ++ break; ++ } ++ opt->rdcache = n; ++ err = 0; ++ opt->type = token; ++ break; ++ case Opt_rdblk: ++ if (unlikely(match_int(&a->args[0], &n) ++ || n < 0 ++ || n > KMALLOC_MAX_SIZE)) { ++ pr_err("bad integer in %s\n", opt_str); ++ break; ++ } ++ if (unlikely(n && n < NAME_MAX)) { ++ pr_err("rdblk must be larger than %d\n", ++ NAME_MAX); ++ break; ++ } ++ opt->rdblk = n; ++ err = 0; ++ opt->type = token; ++ break; ++ case Opt_rdhash: ++ if (unlikely(match_int(&a->args[0], &n) ++ || n < 0 ++ || n * sizeof(struct hlist_head) ++ > KMALLOC_MAX_SIZE)) { ++ pr_err("bad integer in %s\n", opt_str); ++ break; ++ } ++ opt->rdhash = n; ++ err = 0; ++ opt->type = token; ++ break; ++ ++ case Opt_trunc_xino: ++ case Opt_notrunc_xino: ++ case Opt_noxino: ++ case Opt_trunc_xib: ++ case Opt_notrunc_xib: ++ case Opt_shwh: ++ case Opt_noshwh: ++ case Opt_dirperm1: ++ case Opt_nodirperm1: ++ case Opt_plink: ++ case Opt_noplink: ++ case Opt_list_plink: ++ case Opt_dio: ++ case Opt_nodio: ++ case Opt_diropq_a: ++ case Opt_diropq_w: ++ case Opt_warn_perm: ++ case Opt_nowarn_perm: ++ case Opt_verbose: ++ case Opt_noverbose: ++ case Opt_sum: ++ case Opt_nosum: ++ case Opt_wsum: ++ case Opt_rdblk_def: ++ case Opt_rdhash_def: ++ case Opt_acl: ++ case Opt_noacl: ++ err = 0; ++ opt->type = token; ++ break; ++ ++ case Opt_udba: ++ opt->udba = udba_val(a->args[0].from); ++ if (opt->udba >= 0) { ++ err = 0; ++ opt->type = token; ++ } else ++ pr_err("wrong value, %s\n", opt_str); ++ break; ++ ++ case Opt_wbr_create: ++ u.create = &opt->wbr_create; ++ u.create->wbr_create ++ = au_wbr_create_val(a->args[0].from, u.create); ++ if (u.create->wbr_create >= 0) { ++ err = 0; ++ opt->type = token; ++ } else ++ pr_err("wrong value, %s\n", opt_str); ++ break; ++ case Opt_wbr_copyup: ++ opt->wbr_copyup = au_wbr_copyup_val(a->args[0].from); ++ if (opt->wbr_copyup >= 0) { ++ err = 0; ++ opt->type = token; ++ } else ++ pr_err("wrong value, %s\n", opt_str); ++ break; ++ ++ case Opt_fhsm_sec: ++ if (unlikely(match_int(&a->args[0], &n) ++ || n < 0)) { ++ pr_err("bad integer in %s\n", opt_str); ++ break; ++ } ++ if (sysaufs_brs) { ++ opt->fhsm_second = n; ++ opt->type = token; ++ } else ++ pr_warn("ignored %s\n", opt_str); ++ err = 0; ++ break; ++ ++ case Opt_ignore: ++ pr_warn("ignored %s\n", opt_str); ++ /*FALLTHROUGH*/ ++ case Opt_ignore_silent: ++ skipped = 1; ++ err = 0; ++ break; ++ case Opt_err: ++ pr_err("unknown option %s\n", opt_str); ++ break; ++ } ++ ++ if (!err && !skipped) { ++ if (unlikely(++opt > opt_tail)) { ++ err = -E2BIG; ++ opt--; ++ opt->type = Opt_tail; ++ break; ++ } ++ opt->type = Opt_tail; ++ } ++ } ++ ++ kfree(a); ++ dump_opts(opts); ++ if (unlikely(err)) ++ au_opts_free(opts); ++ ++out: ++ return err; ++} ++ ++static int au_opt_wbr_create(struct super_block *sb, ++ struct au_opt_wbr_create *create) ++{ ++ int err; ++ struct au_sbinfo *sbinfo; ++ ++ SiMustWriteLock(sb); ++ ++ err = 1; /* handled */ ++ sbinfo = au_sbi(sb); ++ if (sbinfo->si_wbr_create_ops->fin) { ++ err = sbinfo->si_wbr_create_ops->fin(sb); ++ if (!err) ++ err = 1; ++ } ++ ++ sbinfo->si_wbr_create = create->wbr_create; ++ sbinfo->si_wbr_create_ops = au_wbr_create_ops + create->wbr_create; ++ switch (create->wbr_create) { ++ case AuWbrCreate_MFSRRV: ++ case AuWbrCreate_MFSRR: ++ case AuWbrCreate_PMFSRR: ++ case AuWbrCreate_PMFSRRV: ++ sbinfo->si_wbr_mfs.mfsrr_watermark = create->mfsrr_watermark; ++ /*FALLTHROUGH*/ ++ case AuWbrCreate_MFS: ++ case AuWbrCreate_MFSV: ++ case AuWbrCreate_PMFS: ++ case AuWbrCreate_PMFSV: ++ sbinfo->si_wbr_mfs.mfs_expire ++ = msecs_to_jiffies(create->mfs_second * MSEC_PER_SEC); ++ break; ++ } ++ ++ if (sbinfo->si_wbr_create_ops->init) ++ sbinfo->si_wbr_create_ops->init(sb); /* ignore */ ++ ++ return err; ++} ++ ++/* ++ * returns, ++ * plus: processed without an error ++ * zero: unprocessed ++ */ ++static int au_opt_simple(struct super_block *sb, struct au_opt *opt, ++ struct au_opts *opts) ++{ ++ int err; ++ struct au_sbinfo *sbinfo; ++ ++ SiMustWriteLock(sb); ++ ++ err = 1; /* handled */ ++ sbinfo = au_sbi(sb); ++ switch (opt->type) { ++ case Opt_udba: ++ sbinfo->si_mntflags &= ~AuOptMask_UDBA; ++ sbinfo->si_mntflags |= opt->udba; ++ opts->given_udba |= opt->udba; ++ break; ++ ++ case Opt_plink: ++ au_opt_set(sbinfo->si_mntflags, PLINK); ++ break; ++ case Opt_noplink: ++ if (au_opt_test(sbinfo->si_mntflags, PLINK)) ++ au_plink_put(sb, /*verbose*/1); ++ au_opt_clr(sbinfo->si_mntflags, PLINK); ++ break; ++ case Opt_list_plink: ++ if (au_opt_test(sbinfo->si_mntflags, PLINK)) ++ au_plink_list(sb); ++ break; ++ ++ case Opt_dio: ++ au_opt_set(sbinfo->si_mntflags, DIO); ++ au_fset_opts(opts->flags, REFRESH_DYAOP); ++ break; ++ case Opt_nodio: ++ au_opt_clr(sbinfo->si_mntflags, DIO); ++ au_fset_opts(opts->flags, REFRESH_DYAOP); ++ break; ++ ++ case Opt_fhsm_sec: ++ au_fhsm_set(sbinfo, opt->fhsm_second); ++ break; ++ ++ case Opt_diropq_a: ++ au_opt_set(sbinfo->si_mntflags, ALWAYS_DIROPQ); ++ break; ++ case Opt_diropq_w: ++ au_opt_clr(sbinfo->si_mntflags, ALWAYS_DIROPQ); ++ break; ++ ++ case Opt_warn_perm: ++ au_opt_set(sbinfo->si_mntflags, WARN_PERM); ++ break; ++ case Opt_nowarn_perm: ++ au_opt_clr(sbinfo->si_mntflags, WARN_PERM); ++ break; ++ ++ case Opt_verbose: ++ au_opt_set(sbinfo->si_mntflags, VERBOSE); ++ break; ++ case Opt_noverbose: ++ au_opt_clr(sbinfo->si_mntflags, VERBOSE); ++ break; ++ ++ case Opt_sum: ++ au_opt_set(sbinfo->si_mntflags, SUM); ++ break; ++ case Opt_wsum: ++ au_opt_clr(sbinfo->si_mntflags, SUM); ++ au_opt_set(sbinfo->si_mntflags, SUM_W); ++ case Opt_nosum: ++ au_opt_clr(sbinfo->si_mntflags, SUM); ++ au_opt_clr(sbinfo->si_mntflags, SUM_W); ++ break; ++ ++ case Opt_wbr_create: ++ err = au_opt_wbr_create(sb, &opt->wbr_create); ++ break; ++ case Opt_wbr_copyup: ++ sbinfo->si_wbr_copyup = opt->wbr_copyup; ++ sbinfo->si_wbr_copyup_ops = au_wbr_copyup_ops + opt->wbr_copyup; ++ break; ++ ++ case Opt_dirwh: ++ sbinfo->si_dirwh = opt->dirwh; ++ break; ++ ++ case Opt_rdcache: ++ sbinfo->si_rdcache ++ = msecs_to_jiffies(opt->rdcache * MSEC_PER_SEC); ++ break; ++ case Opt_rdblk: ++ sbinfo->si_rdblk = opt->rdblk; ++ break; ++ case Opt_rdblk_def: ++ sbinfo->si_rdblk = AUFS_RDBLK_DEF; ++ break; ++ case Opt_rdhash: ++ sbinfo->si_rdhash = opt->rdhash; ++ break; ++ case Opt_rdhash_def: ++ sbinfo->si_rdhash = AUFS_RDHASH_DEF; ++ break; ++ ++ case Opt_shwh: ++ au_opt_set(sbinfo->si_mntflags, SHWH); ++ break; ++ case Opt_noshwh: ++ au_opt_clr(sbinfo->si_mntflags, SHWH); ++ break; ++ ++ case Opt_dirperm1: ++ au_opt_set(sbinfo->si_mntflags, DIRPERM1); ++ break; ++ case Opt_nodirperm1: ++ au_opt_clr(sbinfo->si_mntflags, DIRPERM1); ++ break; ++ ++ case Opt_trunc_xino: ++ au_opt_set(sbinfo->si_mntflags, TRUNC_XINO); ++ break; ++ case Opt_notrunc_xino: ++ au_opt_clr(sbinfo->si_mntflags, TRUNC_XINO); ++ break; ++ ++ case Opt_trunc_xino_path: ++ case Opt_itrunc_xino: ++ err = au_xino_trunc(sb, opt->xino_itrunc.bindex); ++ if (!err) ++ err = 1; ++ break; ++ ++ case Opt_trunc_xib: ++ au_fset_opts(opts->flags, TRUNC_XIB); ++ break; ++ case Opt_notrunc_xib: ++ au_fclr_opts(opts->flags, TRUNC_XIB); ++ break; ++ ++ case Opt_acl: ++ sb->s_flags |= MS_POSIXACL; ++ break; ++ case Opt_noacl: ++ sb->s_flags &= ~MS_POSIXACL; ++ break; ++ ++ default: ++ err = 0; ++ break; ++ } ++ ++ return err; ++} ++ ++/* ++ * returns tri-state. ++ * plus: processed without an error ++ * zero: unprocessed ++ * minus: error ++ */ ++static int au_opt_br(struct super_block *sb, struct au_opt *opt, ++ struct au_opts *opts) ++{ ++ int err, do_refresh; ++ ++ err = 0; ++ switch (opt->type) { ++ case Opt_append: ++ opt->add.bindex = au_sbbot(sb) + 1; ++ if (opt->add.bindex < 0) ++ opt->add.bindex = 0; ++ goto add; ++ case Opt_prepend: ++ opt->add.bindex = 0; ++ add: /* indented label */ ++ case Opt_add: ++ err = au_br_add(sb, &opt->add, ++ au_ftest_opts(opts->flags, REMOUNT)); ++ if (!err) { ++ err = 1; ++ au_fset_opts(opts->flags, REFRESH); ++ } ++ break; ++ ++ case Opt_del: ++ case Opt_idel: ++ err = au_br_del(sb, &opt->del, ++ au_ftest_opts(opts->flags, REMOUNT)); ++ if (!err) { ++ err = 1; ++ au_fset_opts(opts->flags, TRUNC_XIB); ++ au_fset_opts(opts->flags, REFRESH); ++ } ++ break; ++ ++ case Opt_mod: ++ case Opt_imod: ++ err = au_br_mod(sb, &opt->mod, ++ au_ftest_opts(opts->flags, REMOUNT), ++ &do_refresh); ++ if (!err) { ++ err = 1; ++ if (do_refresh) ++ au_fset_opts(opts->flags, REFRESH); ++ } ++ break; ++ } ++ ++ return err; ++} ++ ++static int au_opt_xino(struct super_block *sb, struct au_opt *opt, ++ struct au_opt_xino **opt_xino, ++ struct au_opts *opts) ++{ ++ int err; ++ aufs_bindex_t bbot, bindex; ++ struct dentry *root, *parent, *h_root; ++ ++ err = 0; ++ switch (opt->type) { ++ case Opt_xino: ++ err = au_xino_set(sb, &opt->xino, ++ !!au_ftest_opts(opts->flags, REMOUNT)); ++ if (unlikely(err)) ++ break; ++ ++ *opt_xino = &opt->xino; ++ au_xino_brid_set(sb, -1); ++ ++ /* safe d_parent access */ ++ parent = opt->xino.file->f_path.dentry->d_parent; ++ root = sb->s_root; ++ bbot = au_sbbot(sb); ++ for (bindex = 0; bindex <= bbot; bindex++) { ++ h_root = au_h_dptr(root, bindex); ++ if (h_root == parent) { ++ au_xino_brid_set(sb, au_sbr_id(sb, bindex)); ++ break; ++ } ++ } ++ break; ++ ++ case Opt_noxino: ++ au_xino_clr(sb); ++ au_xino_brid_set(sb, -1); ++ *opt_xino = (void *)-1; ++ break; ++ } ++ ++ return err; ++} ++ ++int au_opts_verify(struct super_block *sb, unsigned long sb_flags, ++ unsigned int pending) ++{ ++ int err, fhsm; ++ aufs_bindex_t bindex, bbot; ++ unsigned char do_plink, skip, do_free, can_no_dreval; ++ struct au_branch *br; ++ struct au_wbr *wbr; ++ struct dentry *root, *dentry; ++ struct inode *dir, *h_dir; ++ struct au_sbinfo *sbinfo; ++ struct au_hinode *hdir; ++ ++ SiMustAnyLock(sb); ++ ++ sbinfo = au_sbi(sb); ++ AuDebugOn(!(sbinfo->si_mntflags & AuOptMask_UDBA)); ++ ++ if (!(sb_flags & MS_RDONLY)) { ++ if (unlikely(!au_br_writable(au_sbr_perm(sb, 0)))) ++ pr_warn("first branch should be rw\n"); ++ if (unlikely(au_opt_test(sbinfo->si_mntflags, SHWH))) ++ pr_warn_once("shwh should be used with ro\n"); ++ } ++ ++ if (au_opt_test((sbinfo->si_mntflags | pending), UDBA_HNOTIFY) ++ && !au_opt_test(sbinfo->si_mntflags, XINO)) ++ pr_warn_once("udba=*notify requires xino\n"); ++ ++ if (au_opt_test(sbinfo->si_mntflags, DIRPERM1)) ++ pr_warn_once("dirperm1 breaks the protection" ++ " by the permission bits on the lower branch\n"); ++ ++ err = 0; ++ fhsm = 0; ++ root = sb->s_root; ++ dir = d_inode(root); ++ do_plink = !!au_opt_test(sbinfo->si_mntflags, PLINK); ++ can_no_dreval = !!au_opt_test((sbinfo->si_mntflags | pending), ++ UDBA_NONE); ++ bbot = au_sbbot(sb); ++ for (bindex = 0; !err && bindex <= bbot; bindex++) { ++ skip = 0; ++ h_dir = au_h_iptr(dir, bindex); ++ br = au_sbr(sb, bindex); ++ ++ if ((br->br_perm & AuBrAttr_ICEX) ++ && !h_dir->i_op->listxattr) ++ br->br_perm &= ~AuBrAttr_ICEX; ++#if 0 ++ if ((br->br_perm & AuBrAttr_ICEX_SEC) ++ && (au_br_sb(br)->s_flags & MS_NOSEC)) ++ br->br_perm &= ~AuBrAttr_ICEX_SEC; ++#endif ++ ++ do_free = 0; ++ wbr = br->br_wbr; ++ if (wbr) ++ wbr_wh_read_lock(wbr); ++ ++ if (!au_br_writable(br->br_perm)) { ++ do_free = !!wbr; ++ skip = (!wbr ++ || (!wbr->wbr_whbase ++ && !wbr->wbr_plink ++ && !wbr->wbr_orph)); ++ } else if (!au_br_wh_linkable(br->br_perm)) { ++ /* skip = (!br->br_whbase && !br->br_orph); */ ++ skip = (!wbr || !wbr->wbr_whbase); ++ if (skip && wbr) { ++ if (do_plink) ++ skip = !!wbr->wbr_plink; ++ else ++ skip = !wbr->wbr_plink; ++ } ++ } else { ++ /* skip = (br->br_whbase && br->br_ohph); */ ++ skip = (wbr && wbr->wbr_whbase); ++ if (skip) { ++ if (do_plink) ++ skip = !!wbr->wbr_plink; ++ else ++ skip = !wbr->wbr_plink; ++ } ++ } ++ if (wbr) ++ wbr_wh_read_unlock(wbr); ++ ++ if (can_no_dreval) { ++ dentry = br->br_path.dentry; ++ spin_lock(&dentry->d_lock); ++ if (dentry->d_flags & ++ (DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE)) ++ can_no_dreval = 0; ++ spin_unlock(&dentry->d_lock); ++ } ++ ++ if (au_br_fhsm(br->br_perm)) { ++ fhsm++; ++ AuDebugOn(!br->br_fhsm); ++ } ++ ++ if (skip) ++ continue; ++ ++ hdir = au_hi(dir, bindex); ++ au_hn_imtx_lock_nested(hdir, AuLsc_I_PARENT); ++ if (wbr) ++ wbr_wh_write_lock(wbr); ++ err = au_wh_init(br, sb); ++ if (wbr) ++ wbr_wh_write_unlock(wbr); ++ au_hn_imtx_unlock(hdir); ++ ++ if (!err && do_free) { ++ kfree(wbr); ++ br->br_wbr = NULL; ++ } ++ } ++ ++ if (can_no_dreval) ++ au_fset_si(sbinfo, NO_DREVAL); ++ else ++ au_fclr_si(sbinfo, NO_DREVAL); ++ ++ if (fhsm >= 2) { ++ au_fset_si(sbinfo, FHSM); ++ for (bindex = bbot; bindex >= 0; bindex--) { ++ br = au_sbr(sb, bindex); ++ if (au_br_fhsm(br->br_perm)) { ++ au_fhsm_set_bottom(sb, bindex); ++ break; ++ } ++ } ++ } else { ++ au_fclr_si(sbinfo, FHSM); ++ au_fhsm_set_bottom(sb, -1); ++ } ++ ++ return err; ++} ++ ++int au_opts_mount(struct super_block *sb, struct au_opts *opts) ++{ ++ int err; ++ unsigned int tmp; ++ aufs_bindex_t bindex, bbot; ++ struct au_opt *opt; ++ struct au_opt_xino *opt_xino, xino; ++ struct au_sbinfo *sbinfo; ++ struct au_branch *br; ++ struct inode *dir; ++ ++ SiMustWriteLock(sb); ++ ++ err = 0; ++ opt_xino = NULL; ++ opt = opts->opt; ++ while (err >= 0 && opt->type != Opt_tail) ++ err = au_opt_simple(sb, opt++, opts); ++ if (err > 0) ++ err = 0; ++ else if (unlikely(err < 0)) ++ goto out; ++ ++ /* disable xino and udba temporary */ ++ sbinfo = au_sbi(sb); ++ tmp = sbinfo->si_mntflags; ++ au_opt_clr(sbinfo->si_mntflags, XINO); ++ au_opt_set_udba(sbinfo->si_mntflags, UDBA_REVAL); ++ ++ opt = opts->opt; ++ while (err >= 0 && opt->type != Opt_tail) ++ err = au_opt_br(sb, opt++, opts); ++ if (err > 0) ++ err = 0; ++ else if (unlikely(err < 0)) ++ goto out; ++ ++ bbot = au_sbbot(sb); ++ if (unlikely(bbot < 0)) { ++ err = -EINVAL; ++ pr_err("no branches\n"); ++ goto out; ++ } ++ ++ if (au_opt_test(tmp, XINO)) ++ au_opt_set(sbinfo->si_mntflags, XINO); ++ opt = opts->opt; ++ while (!err && opt->type != Opt_tail) ++ err = au_opt_xino(sb, opt++, &opt_xino, opts); ++ if (unlikely(err)) ++ goto out; ++ ++ err = au_opts_verify(sb, sb->s_flags, tmp); ++ if (unlikely(err)) ++ goto out; ++ ++ /* restore xino */ ++ if (au_opt_test(tmp, XINO) && !opt_xino) { ++ xino.file = au_xino_def(sb); ++ err = PTR_ERR(xino.file); ++ if (IS_ERR(xino.file)) ++ goto out; ++ ++ err = au_xino_set(sb, &xino, /*remount*/0); ++ fput(xino.file); ++ if (unlikely(err)) ++ goto out; ++ } ++ ++ /* restore udba */ ++ tmp &= AuOptMask_UDBA; ++ sbinfo->si_mntflags &= ~AuOptMask_UDBA; ++ sbinfo->si_mntflags |= tmp; ++ bbot = au_sbbot(sb); ++ for (bindex = 0; bindex <= bbot; bindex++) { ++ br = au_sbr(sb, bindex); ++ err = au_hnotify_reset_br(tmp, br, br->br_perm); ++ if (unlikely(err)) ++ AuIOErr("hnotify failed on br %d, %d, ignored\n", ++ bindex, err); ++ /* go on even if err */ ++ } ++ if (au_opt_test(tmp, UDBA_HNOTIFY)) { ++ dir = d_inode(sb->s_root); ++ au_hn_reset(dir, au_hi_flags(dir, /*isdir*/1) & ~AuHi_XINO); ++ } ++ ++out: ++ return err; ++} ++ ++int au_opts_remount(struct super_block *sb, struct au_opts *opts) ++{ ++ int err, rerr; ++ unsigned char no_dreval; ++ struct inode *dir; ++ struct au_opt_xino *opt_xino; ++ struct au_opt *opt; ++ struct au_sbinfo *sbinfo; ++ ++ SiMustWriteLock(sb); ++ ++ err = 0; ++ dir = d_inode(sb->s_root); ++ sbinfo = au_sbi(sb); ++ opt_xino = NULL; ++ opt = opts->opt; ++ while (err >= 0 && opt->type != Opt_tail) { ++ err = au_opt_simple(sb, opt, opts); ++ if (!err) ++ err = au_opt_br(sb, opt, opts); ++ if (!err) ++ err = au_opt_xino(sb, opt, &opt_xino, opts); ++ opt++; ++ } ++ if (err > 0) ++ err = 0; ++ AuTraceErr(err); ++ /* go on even err */ ++ ++ no_dreval = !!au_ftest_si(sbinfo, NO_DREVAL); ++ rerr = au_opts_verify(sb, opts->sb_flags, /*pending*/0); ++ if (unlikely(rerr && !err)) ++ err = rerr; ++ ++ if (no_dreval != !!au_ftest_si(sbinfo, NO_DREVAL)) ++ au_fset_opts(opts->flags, REFRESH_IDOP); ++ ++ if (au_ftest_opts(opts->flags, TRUNC_XIB)) { ++ rerr = au_xib_trunc(sb); ++ if (unlikely(rerr && !err)) ++ err = rerr; ++ } ++ ++ /* will be handled by the caller */ ++ if (!au_ftest_opts(opts->flags, REFRESH) ++ && (opts->given_udba ++ || au_opt_test(sbinfo->si_mntflags, XINO) ++ || au_ftest_opts(opts->flags, REFRESH_IDOP) ++ )) ++ au_fset_opts(opts->flags, REFRESH); ++ ++ AuDbg("status 0x%x\n", opts->flags); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++unsigned int au_opt_udba(struct super_block *sb) ++{ ++ return au_mntflags(sb) & AuOptMask_UDBA; ++} +diff --git a/fs/aufs/opts.h b/fs/aufs/opts.h +new file mode 100644 +index 0000000..8d0c534 +--- /dev/null ++++ b/fs/aufs/opts.h +@@ -0,0 +1,211 @@ ++/* ++ * Copyright (C) 2005-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * mount options/flags ++ */ ++ ++#ifndef __AUFS_OPTS_H__ ++#define __AUFS_OPTS_H__ ++ ++#ifdef __KERNEL__ ++ ++#include ++ ++struct file; ++struct super_block; ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* mount flags */ ++#define AuOpt_XINO 1 /* external inode number bitmap ++ and translation table */ ++#define AuOpt_TRUNC_XINO (1 << 1) /* truncate xino files */ ++#define AuOpt_UDBA_NONE (1 << 2) /* users direct branch access */ ++#define AuOpt_UDBA_REVAL (1 << 3) ++#define AuOpt_UDBA_HNOTIFY (1 << 4) ++#define AuOpt_SHWH (1 << 5) /* show whiteout */ ++#define AuOpt_PLINK (1 << 6) /* pseudo-link */ ++#define AuOpt_DIRPERM1 (1 << 7) /* ignore the lower dir's perm ++ bits */ ++#define AuOpt_ALWAYS_DIROPQ (1 << 9) /* policy to creating diropq */ ++#define AuOpt_SUM (1 << 10) /* summation for statfs(2) */ ++#define AuOpt_SUM_W (1 << 11) /* unimplemented */ ++#define AuOpt_WARN_PERM (1 << 12) /* warn when add-branch */ ++#define AuOpt_VERBOSE (1 << 13) /* busy inode when del-branch */ ++#define AuOpt_DIO (1 << 14) /* direct io */ ++ ++#ifndef CONFIG_AUFS_HNOTIFY ++#undef AuOpt_UDBA_HNOTIFY ++#define AuOpt_UDBA_HNOTIFY 0 ++#endif ++#ifndef CONFIG_AUFS_SHWH ++#undef AuOpt_SHWH ++#define AuOpt_SHWH 0 ++#endif ++ ++#define AuOpt_Def (AuOpt_XINO \ ++ | AuOpt_UDBA_REVAL \ ++ | AuOpt_PLINK \ ++ /* | AuOpt_DIRPERM1 */ \ ++ | AuOpt_WARN_PERM) ++#define AuOptMask_UDBA (AuOpt_UDBA_NONE \ ++ | AuOpt_UDBA_REVAL \ ++ | AuOpt_UDBA_HNOTIFY) ++ ++#define au_opt_test(flags, name) (flags & AuOpt_##name) ++#define au_opt_set(flags, name) do { \ ++ BUILD_BUG_ON(AuOpt_##name & AuOptMask_UDBA); \ ++ ((flags) |= AuOpt_##name); \ ++} while (0) ++#define au_opt_set_udba(flags, name) do { \ ++ (flags) &= ~AuOptMask_UDBA; \ ++ ((flags) |= AuOpt_##name); \ ++} while (0) ++#define au_opt_clr(flags, name) do { \ ++ ((flags) &= ~AuOpt_##name); \ ++} while (0) ++ ++static inline unsigned int au_opts_plink(unsigned int mntflags) ++{ ++#ifdef CONFIG_PROC_FS ++ return mntflags; ++#else ++ return mntflags & ~AuOpt_PLINK; ++#endif ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* policies to select one among multiple writable branches */ ++enum { ++ AuWbrCreate_TDP, /* top down parent */ ++ AuWbrCreate_RR, /* round robin */ ++ AuWbrCreate_MFS, /* most free space */ ++ AuWbrCreate_MFSV, /* mfs with seconds */ ++ AuWbrCreate_MFSRR, /* mfs then rr */ ++ AuWbrCreate_MFSRRV, /* mfs then rr with seconds */ ++ AuWbrCreate_PMFS, /* parent and mfs */ ++ AuWbrCreate_PMFSV, /* parent and mfs with seconds */ ++ AuWbrCreate_PMFSRR, /* parent, mfs and round-robin */ ++ AuWbrCreate_PMFSRRV, /* plus seconds */ ++ ++ AuWbrCreate_Def = AuWbrCreate_TDP ++}; ++ ++enum { ++ AuWbrCopyup_TDP, /* top down parent */ ++ AuWbrCopyup_BUP, /* bottom up parent */ ++ AuWbrCopyup_BU, /* bottom up */ ++ ++ AuWbrCopyup_Def = AuWbrCopyup_TDP ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct au_opt_add { ++ aufs_bindex_t bindex; ++ char *pathname; ++ int perm; ++ struct path path; ++}; ++ ++struct au_opt_del { ++ char *pathname; ++ struct path h_path; ++}; ++ ++struct au_opt_mod { ++ char *path; ++ int perm; ++ struct dentry *h_root; ++}; ++ ++struct au_opt_xino { ++ char *path; ++ struct file *file; ++}; ++ ++struct au_opt_xino_itrunc { ++ aufs_bindex_t bindex; ++}; ++ ++struct au_opt_wbr_create { ++ int wbr_create; ++ int mfs_second; ++ unsigned long long mfsrr_watermark; ++}; ++ ++struct au_opt { ++ int type; ++ union { ++ struct au_opt_xino xino; ++ struct au_opt_xino_itrunc xino_itrunc; ++ struct au_opt_add add; ++ struct au_opt_del del; ++ struct au_opt_mod mod; ++ int dirwh; ++ int rdcache; ++ unsigned int rdblk; ++ unsigned int rdhash; ++ int udba; ++ struct au_opt_wbr_create wbr_create; ++ int wbr_copyup; ++ unsigned int fhsm_second; ++ }; ++}; ++ ++/* opts flags */ ++#define AuOpts_REMOUNT 1 ++#define AuOpts_REFRESH (1 << 1) ++#define AuOpts_TRUNC_XIB (1 << 2) ++#define AuOpts_REFRESH_DYAOP (1 << 3) ++#define AuOpts_REFRESH_IDOP (1 << 4) ++#define au_ftest_opts(flags, name) ((flags) & AuOpts_##name) ++#define au_fset_opts(flags, name) \ ++ do { (flags) |= AuOpts_##name; } while (0) ++#define au_fclr_opts(flags, name) \ ++ do { (flags) &= ~AuOpts_##name; } while (0) ++ ++struct au_opts { ++ struct au_opt *opt; ++ int max_opt; ++ ++ unsigned int given_udba; ++ unsigned int flags; ++ unsigned long sb_flags; ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* opts.c */ ++void au_optstr_br_perm(au_br_perm_str_t *str, int perm); ++const char *au_optstr_udba(int udba); ++const char *au_optstr_wbr_copyup(int wbr_copyup); ++const char *au_optstr_wbr_create(int wbr_create); ++ ++void au_opts_free(struct au_opts *opts); ++int au_opts_parse(struct super_block *sb, char *str, struct au_opts *opts); ++int au_opts_verify(struct super_block *sb, unsigned long sb_flags, ++ unsigned int pending); ++int au_opts_mount(struct super_block *sb, struct au_opts *opts); ++int au_opts_remount(struct super_block *sb, struct au_opts *opts); ++ ++unsigned int au_opt_udba(struct super_block *sb); ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_OPTS_H__ */ +diff --git a/fs/aufs/plink.c b/fs/aufs/plink.c +new file mode 100644 +index 0000000..a565a11 +--- /dev/null ++++ b/fs/aufs/plink.c +@@ -0,0 +1,502 @@ ++/* ++ * Copyright (C) 2005-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * pseudo-link ++ */ ++ ++#include "aufs.h" ++ ++/* ++ * the pseudo-link maintenance mode. ++ * during a user process maintains the pseudo-links, ++ * prohibit adding a new plink and branch manipulation. ++ * ++ * Flags ++ * NOPLM: ++ * For entry functions which will handle plink, and i_mutex is already held ++ * in VFS. ++ * They cannot wait and should return an error at once. ++ * Callers has to check the error. ++ * NOPLMW: ++ * For entry functions which will handle plink, but i_mutex is not held ++ * in VFS. ++ * They can wait the plink maintenance mode to finish. ++ * ++ * They behave like F_SETLK and F_SETLKW. ++ * If the caller never handle plink, then both flags are unnecessary. ++ */ ++ ++int au_plink_maint(struct super_block *sb, int flags) ++{ ++ int err; ++ pid_t pid, ppid; ++ struct au_sbinfo *sbi; ++ ++ SiMustAnyLock(sb); ++ ++ err = 0; ++ if (!au_opt_test(au_mntflags(sb), PLINK)) ++ goto out; ++ ++ sbi = au_sbi(sb); ++ pid = sbi->si_plink_maint_pid; ++ if (!pid || pid == current->pid) ++ goto out; ++ ++ /* todo: it highly depends upon /sbin/mount.aufs */ ++ rcu_read_lock(); ++ ppid = task_pid_vnr(rcu_dereference(current->real_parent)); ++ rcu_read_unlock(); ++ if (pid == ppid) ++ goto out; ++ ++ if (au_ftest_lock(flags, NOPLMW)) { ++ /* if there is no i_mutex lock in VFS, we don't need to wait */ ++ /* AuDebugOn(!lockdep_depth(current)); */ ++ while (sbi->si_plink_maint_pid) { ++ si_read_unlock(sb); ++ /* gave up wake_up_bit() */ ++ wait_event(sbi->si_plink_wq, !sbi->si_plink_maint_pid); ++ ++ if (au_ftest_lock(flags, FLUSH)) ++ au_nwt_flush(&sbi->si_nowait); ++ si_noflush_read_lock(sb); ++ } ++ } else if (au_ftest_lock(flags, NOPLM)) { ++ AuDbg("ppid %d, pid %d\n", ppid, pid); ++ err = -EAGAIN; ++ } ++ ++out: ++ return err; ++} ++ ++void au_plink_maint_leave(struct au_sbinfo *sbinfo) ++{ ++ spin_lock(&sbinfo->si_plink_maint_lock); ++ sbinfo->si_plink_maint_pid = 0; ++ spin_unlock(&sbinfo->si_plink_maint_lock); ++ wake_up_all(&sbinfo->si_plink_wq); ++} ++ ++int au_plink_maint_enter(struct super_block *sb) ++{ ++ int err; ++ struct au_sbinfo *sbinfo; ++ ++ err = 0; ++ sbinfo = au_sbi(sb); ++ /* make sure i am the only one in this fs */ ++ si_write_lock(sb, AuLock_FLUSH); ++ if (au_opt_test(au_mntflags(sb), PLINK)) { ++ spin_lock(&sbinfo->si_plink_maint_lock); ++ if (!sbinfo->si_plink_maint_pid) ++ sbinfo->si_plink_maint_pid = current->pid; ++ else ++ err = -EBUSY; ++ spin_unlock(&sbinfo->si_plink_maint_lock); ++ } ++ si_write_unlock(sb); ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++#ifdef CONFIG_AUFS_DEBUG ++void au_plink_list(struct super_block *sb) ++{ ++ int i; ++ struct au_sbinfo *sbinfo; ++ struct hlist_head *plink_hlist; ++ struct au_icntnr *icntnr; ++ ++ SiMustAnyLock(sb); ++ ++ sbinfo = au_sbi(sb); ++ AuDebugOn(!au_opt_test(au_mntflags(sb), PLINK)); ++ AuDebugOn(au_plink_maint(sb, AuLock_NOPLM)); ++ ++ for (i = 0; i < AuPlink_NHASH; i++) { ++ plink_hlist = &sbinfo->si_plink[i].head; ++ rcu_read_lock(); ++ hlist_for_each_entry_rcu(icntnr, plink_hlist, plink) ++ AuDbg("%lu\n", icntnr->vfs_inode.i_ino); ++ rcu_read_unlock(); ++ } ++} ++#endif ++ ++/* is the inode pseudo-linked? */ ++int au_plink_test(struct inode *inode) ++{ ++ int found, i; ++ struct au_sbinfo *sbinfo; ++ struct hlist_head *plink_hlist; ++ struct au_icntnr *icntnr; ++ ++ sbinfo = au_sbi(inode->i_sb); ++ AuRwMustAnyLock(&sbinfo->si_rwsem); ++ AuDebugOn(!au_opt_test(au_mntflags(inode->i_sb), PLINK)); ++ AuDebugOn(au_plink_maint(inode->i_sb, AuLock_NOPLM)); ++ ++ found = 0; ++ i = au_plink_hash(inode->i_ino); ++ plink_hlist = &sbinfo->si_plink[i].head; ++ rcu_read_lock(); ++ hlist_for_each_entry_rcu(icntnr, plink_hlist, plink) ++ if (&icntnr->vfs_inode == inode) { ++ found = 1; ++ break; ++ } ++ rcu_read_unlock(); ++ return found; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * generate a name for plink. ++ * the file will be stored under AUFS_WH_PLINKDIR. ++ */ ++/* 20 is max digits length of ulong 64 */ ++#define PLINK_NAME_LEN ((20 + 1) * 2) ++ ++static int plink_name(char *name, int len, struct inode *inode, ++ aufs_bindex_t bindex) ++{ ++ int rlen; ++ struct inode *h_inode; ++ ++ h_inode = au_h_iptr(inode, bindex); ++ rlen = snprintf(name, len, "%lu.%lu", inode->i_ino, h_inode->i_ino); ++ return rlen; ++} ++ ++struct au_do_plink_lkup_args { ++ struct dentry **errp; ++ struct qstr *tgtname; ++ struct dentry *h_parent; ++ struct au_branch *br; ++}; ++ ++static struct dentry *au_do_plink_lkup(struct qstr *tgtname, ++ struct dentry *h_parent, ++ struct au_branch *br) ++{ ++ struct dentry *h_dentry; ++ struct inode *h_inode; ++ ++ h_inode = d_inode(h_parent); ++ inode_lock_nested(h_inode, AuLsc_I_CHILD2); ++ h_dentry = vfsub_lkup_one(tgtname, h_parent); ++ inode_unlock(h_inode); ++ return h_dentry; ++} ++ ++static void au_call_do_plink_lkup(void *args) ++{ ++ struct au_do_plink_lkup_args *a = args; ++ *a->errp = au_do_plink_lkup(a->tgtname, a->h_parent, a->br); ++} ++ ++/* lookup the plink-ed @inode under the branch at @bindex */ ++struct dentry *au_plink_lkup(struct inode *inode, aufs_bindex_t bindex) ++{ ++ struct dentry *h_dentry, *h_parent; ++ struct au_branch *br; ++ int wkq_err; ++ char a[PLINK_NAME_LEN]; ++ struct qstr tgtname = QSTR_INIT(a, 0); ++ ++ AuDebugOn(au_plink_maint(inode->i_sb, AuLock_NOPLM)); ++ ++ br = au_sbr(inode->i_sb, bindex); ++ h_parent = br->br_wbr->wbr_plink; ++ tgtname.len = plink_name(a, sizeof(a), inode, bindex); ++ ++ if (!uid_eq(current_fsuid(), GLOBAL_ROOT_UID)) { ++ struct au_do_plink_lkup_args args = { ++ .errp = &h_dentry, ++ .tgtname = &tgtname, ++ .h_parent = h_parent, ++ .br = br ++ }; ++ ++ wkq_err = au_wkq_wait(au_call_do_plink_lkup, &args); ++ if (unlikely(wkq_err)) ++ h_dentry = ERR_PTR(wkq_err); ++ } else ++ h_dentry = au_do_plink_lkup(&tgtname, h_parent, br); ++ ++ return h_dentry; ++} ++ ++/* create a pseudo-link */ ++static int do_whplink(struct qstr *tgt, struct dentry *h_parent, ++ struct dentry *h_dentry, struct au_branch *br) ++{ ++ int err; ++ struct path h_path = { ++ .mnt = au_br_mnt(br) ++ }; ++ struct inode *h_dir, *delegated; ++ ++ h_dir = d_inode(h_parent); ++ inode_lock_nested(h_dir, AuLsc_I_CHILD2); ++again: ++ h_path.dentry = vfsub_lkup_one(tgt, h_parent); ++ err = PTR_ERR(h_path.dentry); ++ if (IS_ERR(h_path.dentry)) ++ goto out; ++ ++ err = 0; ++ /* wh.plink dir is not monitored */ ++ /* todo: is it really safe? */ ++ if (d_is_positive(h_path.dentry) ++ && d_inode(h_path.dentry) != d_inode(h_dentry)) { ++ delegated = NULL; ++ err = vfsub_unlink(h_dir, &h_path, &delegated, /*force*/0); ++ if (unlikely(err == -EWOULDBLOCK)) { ++ pr_warn("cannot retry for NFSv4 delegation" ++ " for an internal unlink\n"); ++ iput(delegated); ++ } ++ dput(h_path.dentry); ++ h_path.dentry = NULL; ++ if (!err) ++ goto again; ++ } ++ if (!err && d_is_negative(h_path.dentry)) { ++ delegated = NULL; ++ err = vfsub_link(h_dentry, h_dir, &h_path, &delegated); ++ if (unlikely(err == -EWOULDBLOCK)) { ++ pr_warn("cannot retry for NFSv4 delegation" ++ " for an internal link\n"); ++ iput(delegated); ++ } ++ } ++ dput(h_path.dentry); ++ ++out: ++ inode_unlock(h_dir); ++ return err; ++} ++ ++struct do_whplink_args { ++ int *errp; ++ struct qstr *tgt; ++ struct dentry *h_parent; ++ struct dentry *h_dentry; ++ struct au_branch *br; ++}; ++ ++static void call_do_whplink(void *args) ++{ ++ struct do_whplink_args *a = args; ++ *a->errp = do_whplink(a->tgt, a->h_parent, a->h_dentry, a->br); ++} ++ ++static int whplink(struct dentry *h_dentry, struct inode *inode, ++ aufs_bindex_t bindex, struct au_branch *br) ++{ ++ int err, wkq_err; ++ struct au_wbr *wbr; ++ struct dentry *h_parent; ++ char a[PLINK_NAME_LEN]; ++ struct qstr tgtname = QSTR_INIT(a, 0); ++ ++ wbr = au_sbr(inode->i_sb, bindex)->br_wbr; ++ h_parent = wbr->wbr_plink; ++ tgtname.len = plink_name(a, sizeof(a), inode, bindex); ++ ++ /* always superio. */ ++ if (!uid_eq(current_fsuid(), GLOBAL_ROOT_UID)) { ++ struct do_whplink_args args = { ++ .errp = &err, ++ .tgt = &tgtname, ++ .h_parent = h_parent, ++ .h_dentry = h_dentry, ++ .br = br ++ }; ++ wkq_err = au_wkq_wait(call_do_whplink, &args); ++ if (unlikely(wkq_err)) ++ err = wkq_err; ++ } else ++ err = do_whplink(&tgtname, h_parent, h_dentry, br); ++ ++ return err; ++} ++ ++/* ++ * create a new pseudo-link for @h_dentry on @bindex. ++ * the linked inode is held in aufs @inode. ++ */ ++void au_plink_append(struct inode *inode, aufs_bindex_t bindex, ++ struct dentry *h_dentry) ++{ ++ struct super_block *sb; ++ struct au_sbinfo *sbinfo; ++ struct hlist_head *plink_hlist; ++ struct au_icntnr *icntnr; ++ struct au_sphlhead *sphl; ++ int found, err, cnt, i; ++ ++ sb = inode->i_sb; ++ sbinfo = au_sbi(sb); ++ AuDebugOn(!au_opt_test(au_mntflags(sb), PLINK)); ++ AuDebugOn(au_plink_maint(sb, AuLock_NOPLM)); ++ ++ found = au_plink_test(inode); ++ if (found) ++ return; ++ ++ i = au_plink_hash(inode->i_ino); ++ sphl = sbinfo->si_plink + i; ++ plink_hlist = &sphl->head; ++ au_igrab(inode); ++ ++ spin_lock(&sphl->spin); ++ hlist_for_each_entry(icntnr, plink_hlist, plink) { ++ if (&icntnr->vfs_inode == inode) { ++ found = 1; ++ break; ++ } ++ } ++ if (!found) { ++ icntnr = container_of(inode, struct au_icntnr, vfs_inode); ++ hlist_add_head_rcu(&icntnr->plink, plink_hlist); ++ } ++ spin_unlock(&sphl->spin); ++ if (!found) { ++ cnt = au_sphl_count(sphl); ++#define msg "unexpectedly unblanced or too many pseudo-links" ++ if (cnt > AUFS_PLINK_WARN) ++ AuWarn1(msg ", %d\n", cnt); ++#undef msg ++ err = whplink(h_dentry, inode, bindex, au_sbr(sb, bindex)); ++ if (unlikely(err)) { ++ pr_warn("err %d, damaged pseudo link.\n", err); ++ au_sphl_del_rcu(&icntnr->plink, sphl); ++ iput(&icntnr->vfs_inode); ++ } ++ } else ++ iput(&icntnr->vfs_inode); ++} ++ ++/* free all plinks */ ++void au_plink_put(struct super_block *sb, int verbose) ++{ ++ int i, warned; ++ struct au_sbinfo *sbinfo; ++ struct hlist_head *plink_hlist; ++ struct hlist_node *tmp; ++ struct au_icntnr *icntnr; ++ ++ SiMustWriteLock(sb); ++ ++ sbinfo = au_sbi(sb); ++ AuDebugOn(!au_opt_test(au_mntflags(sb), PLINK)); ++ AuDebugOn(au_plink_maint(sb, AuLock_NOPLM)); ++ ++ /* no spin_lock since sbinfo is write-locked */ ++ warned = 0; ++ for (i = 0; i < AuPlink_NHASH; i++) { ++ plink_hlist = &sbinfo->si_plink[i].head; ++ if (!warned && verbose && !hlist_empty(plink_hlist)) { ++ pr_warn("pseudo-link is not flushed"); ++ warned = 1; ++ } ++ hlist_for_each_entry_safe(icntnr, tmp, plink_hlist, plink) ++ iput(&icntnr->vfs_inode); ++ INIT_HLIST_HEAD(plink_hlist); ++ } ++} ++ ++void au_plink_clean(struct super_block *sb, int verbose) ++{ ++ struct dentry *root; ++ ++ root = sb->s_root; ++ aufs_write_lock(root); ++ if (au_opt_test(au_mntflags(sb), PLINK)) ++ au_plink_put(sb, verbose); ++ aufs_write_unlock(root); ++} ++ ++static int au_plink_do_half_refresh(struct inode *inode, aufs_bindex_t br_id) ++{ ++ int do_put; ++ aufs_bindex_t btop, bbot, bindex; ++ ++ do_put = 0; ++ btop = au_ibtop(inode); ++ bbot = au_ibbot(inode); ++ if (btop >= 0) { ++ for (bindex = btop; bindex <= bbot; bindex++) { ++ if (!au_h_iptr(inode, bindex) ++ || au_ii_br_id(inode, bindex) != br_id) ++ continue; ++ au_set_h_iptr(inode, bindex, NULL, 0); ++ do_put = 1; ++ break; ++ } ++ if (do_put) ++ for (bindex = btop; bindex <= bbot; bindex++) ++ if (au_h_iptr(inode, bindex)) { ++ do_put = 0; ++ break; ++ } ++ } else ++ do_put = 1; ++ ++ return do_put; ++} ++ ++/* free the plinks on a branch specified by @br_id */ ++void au_plink_half_refresh(struct super_block *sb, aufs_bindex_t br_id) ++{ ++ struct au_sbinfo *sbinfo; ++ struct hlist_head *plink_hlist; ++ struct hlist_node *tmp; ++ struct au_icntnr *icntnr; ++ struct inode *inode; ++ int i, do_put; ++ ++ SiMustWriteLock(sb); ++ ++ sbinfo = au_sbi(sb); ++ AuDebugOn(!au_opt_test(au_mntflags(sb), PLINK)); ++ AuDebugOn(au_plink_maint(sb, AuLock_NOPLM)); ++ ++ /* no spin_lock since sbinfo is write-locked */ ++ for (i = 0; i < AuPlink_NHASH; i++) { ++ plink_hlist = &sbinfo->si_plink[i].head; ++ hlist_for_each_entry_safe(icntnr, tmp, plink_hlist, plink) { ++ inode = au_igrab(&icntnr->vfs_inode); ++ ii_write_lock_child(inode); ++ do_put = au_plink_do_half_refresh(inode, br_id); ++ if (do_put) { ++ hlist_del(&icntnr->plink); ++ iput(inode); ++ } ++ ii_write_unlock(inode); ++ iput(inode); ++ } ++ } ++} +diff --git a/fs/aufs/poll.c b/fs/aufs/poll.c +new file mode 100644 +index 0000000..720b2ed +--- /dev/null ++++ b/fs/aufs/poll.c +@@ -0,0 +1,52 @@ ++/* ++ * Copyright (C) 2005-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * poll operation ++ * There is only one filesystem which implements ->poll operation, currently. ++ */ ++ ++#include "aufs.h" ++ ++unsigned int aufs_poll(struct file *file, poll_table *wait) ++{ ++ unsigned int mask; ++ int err; ++ struct file *h_file; ++ struct super_block *sb; ++ ++ /* We should pretend an error happened. */ ++ mask = POLLERR /* | POLLIN | POLLOUT */; ++ sb = file->f_path.dentry->d_sb; ++ si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW); ++ ++ h_file = au_read_pre(file, /*keep_fi*/0); ++ err = PTR_ERR(h_file); ++ if (IS_ERR(h_file)) ++ goto out; ++ ++ /* it is not an error if h_file has no operation */ ++ mask = DEFAULT_POLLMASK; ++ if (h_file->f_op->poll) ++ mask = h_file->f_op->poll(h_file, wait); ++ fput(h_file); /* instead of au_read_post() */ ++ ++out: ++ si_read_unlock(sb); ++ AuTraceErr((int)mask); ++ return mask; ++} +diff --git a/fs/aufs/posix_acl.c b/fs/aufs/posix_acl.c +new file mode 100644 +index 0000000..c44f63c +--- /dev/null ++++ b/fs/aufs/posix_acl.c +@@ -0,0 +1,98 @@ ++/* ++ * Copyright (C) 2014-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * posix acl operations ++ */ ++ ++#include ++#include "aufs.h" ++ ++struct posix_acl *aufs_get_acl(struct inode *inode, int type) ++{ ++ struct posix_acl *acl; ++ int err; ++ aufs_bindex_t bindex; ++ struct inode *h_inode; ++ struct super_block *sb; ++ ++ acl = NULL; ++ sb = inode->i_sb; ++ si_read_lock(sb, AuLock_FLUSH); ++ ii_read_lock_child(inode); ++ if (!(sb->s_flags & MS_POSIXACL)) ++ goto out; ++ ++ bindex = au_ibtop(inode); ++ h_inode = au_h_iptr(inode, bindex); ++ if (unlikely(!h_inode ++ || ((h_inode->i_mode & S_IFMT) ++ != (inode->i_mode & S_IFMT)))) { ++ err = au_busy_or_stale(); ++ acl = ERR_PTR(err); ++ goto out; ++ } ++ ++ /* always topmost only */ ++ acl = get_acl(h_inode, type); ++ ++out: ++ ii_read_unlock(inode); ++ si_read_unlock(sb); ++ ++ AuTraceErrPtr(acl); ++ return acl; ++} ++ ++int aufs_set_acl(struct inode *inode, struct posix_acl *acl, int type) ++{ ++ int err; ++ ssize_t ssz; ++ struct dentry *dentry; ++ struct au_srxattr arg = { ++ .type = AU_ACL_SET, ++ .u.acl_set = { ++ .acl = acl, ++ .type = type ++ }, ++ }; ++ ++ inode_lock(inode); ++ if (inode->i_ino == AUFS_ROOT_INO) ++ dentry = dget(inode->i_sb->s_root); ++ else { ++ dentry = d_find_alias(inode); ++ if (!dentry) ++ dentry = d_find_any_alias(inode); ++ if (!dentry) { ++ pr_warn("cannot handle this inode, " ++ "please report to aufs-users ML\n"); ++ err = -ENOENT; ++ goto out; ++ } ++ } ++ ++ ssz = au_srxattr(dentry, &arg); ++ dput(dentry); ++ err = ssz; ++ if (ssz >= 0) ++ err = 0; ++ ++out: ++ inode_unlock(inode); ++ return err; ++} +diff --git a/fs/aufs/procfs.c b/fs/aufs/procfs.c +new file mode 100644 +index 0000000..a334330 +--- /dev/null ++++ b/fs/aufs/procfs.c +@@ -0,0 +1,169 @@ ++/* ++ * Copyright (C) 2010-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * procfs interfaces ++ */ ++ ++#include ++#include "aufs.h" ++ ++static int au_procfs_plm_release(struct inode *inode, struct file *file) ++{ ++ struct au_sbinfo *sbinfo; ++ ++ sbinfo = file->private_data; ++ if (sbinfo) { ++ au_plink_maint_leave(sbinfo); ++ kobject_put(&sbinfo->si_kobj); ++ } ++ ++ return 0; ++} ++ ++static void au_procfs_plm_write_clean(struct file *file) ++{ ++ struct au_sbinfo *sbinfo; ++ ++ sbinfo = file->private_data; ++ if (sbinfo) ++ au_plink_clean(sbinfo->si_sb, /*verbose*/0); ++} ++ ++static int au_procfs_plm_write_si(struct file *file, unsigned long id) ++{ ++ int err; ++ struct super_block *sb; ++ struct au_sbinfo *sbinfo; ++ ++ err = -EBUSY; ++ if (unlikely(file->private_data)) ++ goto out; ++ ++ sb = NULL; ++ /* don't use au_sbilist_lock() here */ ++ spin_lock(&au_sbilist.spin); ++ hlist_for_each_entry(sbinfo, &au_sbilist.head, si_list) ++ if (id == sysaufs_si_id(sbinfo)) { ++ kobject_get(&sbinfo->si_kobj); ++ sb = sbinfo->si_sb; ++ break; ++ } ++ spin_unlock(&au_sbilist.spin); ++ ++ err = -EINVAL; ++ if (unlikely(!sb)) ++ goto out; ++ ++ err = au_plink_maint_enter(sb); ++ if (!err) ++ /* keep kobject_get() */ ++ file->private_data = sbinfo; ++ else ++ kobject_put(&sbinfo->si_kobj); ++out: ++ return err; ++} ++ ++/* ++ * Accept a valid "si=xxxx" only. ++ * Once it is accepted successfully, accept "clean" too. ++ */ ++static ssize_t au_procfs_plm_write(struct file *file, const char __user *ubuf, ++ size_t count, loff_t *ppos) ++{ ++ ssize_t err; ++ unsigned long id; ++ /* last newline is allowed */ ++ char buf[3 + sizeof(unsigned long) * 2 + 1]; ++ ++ err = -EACCES; ++ if (unlikely(!capable(CAP_SYS_ADMIN))) ++ goto out; ++ ++ err = -EINVAL; ++ if (unlikely(count > sizeof(buf))) ++ goto out; ++ ++ err = copy_from_user(buf, ubuf, count); ++ if (unlikely(err)) { ++ err = -EFAULT; ++ goto out; ++ } ++ buf[count] = 0; ++ ++ err = -EINVAL; ++ if (!strcmp("clean", buf)) { ++ au_procfs_plm_write_clean(file); ++ goto out_success; ++ } else if (unlikely(strncmp("si=", buf, 3))) ++ goto out; ++ ++ err = kstrtoul(buf + 3, 16, &id); ++ if (unlikely(err)) ++ goto out; ++ ++ err = au_procfs_plm_write_si(file, id); ++ if (unlikely(err)) ++ goto out; ++ ++out_success: ++ err = count; /* success */ ++out: ++ return err; ++} ++ ++static const struct file_operations au_procfs_plm_fop = { ++ .write = au_procfs_plm_write, ++ .release = au_procfs_plm_release, ++ .owner = THIS_MODULE ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++static struct proc_dir_entry *au_procfs_dir; ++ ++void au_procfs_fin(void) ++{ ++ remove_proc_entry(AUFS_PLINK_MAINT_NAME, au_procfs_dir); ++ remove_proc_entry(AUFS_PLINK_MAINT_DIR, NULL); ++} ++ ++int __init au_procfs_init(void) ++{ ++ int err; ++ struct proc_dir_entry *entry; ++ ++ err = -ENOMEM; ++ au_procfs_dir = proc_mkdir(AUFS_PLINK_MAINT_DIR, NULL); ++ if (unlikely(!au_procfs_dir)) ++ goto out; ++ ++ entry = proc_create(AUFS_PLINK_MAINT_NAME, S_IFREG | S_IWUSR, ++ au_procfs_dir, &au_procfs_plm_fop); ++ if (unlikely(!entry)) ++ goto out_dir; ++ ++ err = 0; ++ goto out; /* success */ ++ ++ ++out_dir: ++ remove_proc_entry(AUFS_PLINK_MAINT_DIR, NULL); ++out: ++ return err; ++} +diff --git a/fs/aufs/rdu.c b/fs/aufs/rdu.c +new file mode 100644 +index 0000000..3774f4e +--- /dev/null ++++ b/fs/aufs/rdu.c +@@ -0,0 +1,389 @@ ++/* ++ * Copyright (C) 2005-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * readdir in userspace. ++ */ ++ ++#include ++#include ++#include ++#include "aufs.h" ++ ++/* bits for struct aufs_rdu.flags */ ++#define AuRdu_CALLED 1 ++#define AuRdu_CONT (1 << 1) ++#define AuRdu_FULL (1 << 2) ++#define au_ftest_rdu(flags, name) ((flags) & AuRdu_##name) ++#define au_fset_rdu(flags, name) \ ++ do { (flags) |= AuRdu_##name; } while (0) ++#define au_fclr_rdu(flags, name) \ ++ do { (flags) &= ~AuRdu_##name; } while (0) ++ ++struct au_rdu_arg { ++ struct dir_context ctx; ++ struct aufs_rdu *rdu; ++ union au_rdu_ent_ul ent; ++ unsigned long end; ++ ++ struct super_block *sb; ++ int err; ++}; ++ ++static int au_rdu_fill(struct dir_context *ctx, const char *name, int nlen, ++ loff_t offset, u64 h_ino, unsigned int d_type) ++{ ++ int err, len; ++ struct au_rdu_arg *arg = container_of(ctx, struct au_rdu_arg, ctx); ++ struct aufs_rdu *rdu = arg->rdu; ++ struct au_rdu_ent ent; ++ ++ err = 0; ++ arg->err = 0; ++ au_fset_rdu(rdu->cookie.flags, CALLED); ++ len = au_rdu_len(nlen); ++ if (arg->ent.ul + len < arg->end) { ++ ent.ino = h_ino; ++ ent.bindex = rdu->cookie.bindex; ++ ent.type = d_type; ++ ent.nlen = nlen; ++ if (unlikely(nlen > AUFS_MAX_NAMELEN)) ++ ent.type = DT_UNKNOWN; ++ ++ /* unnecessary to support mmap_sem since this is a dir */ ++ err = -EFAULT; ++ if (copy_to_user(arg->ent.e, &ent, sizeof(ent))) ++ goto out; ++ if (copy_to_user(arg->ent.e->name, name, nlen)) ++ goto out; ++ /* the terminating NULL */ ++ if (__put_user(0, arg->ent.e->name + nlen)) ++ goto out; ++ err = 0; ++ /* AuDbg("%p, %.*s\n", arg->ent.p, nlen, name); */ ++ arg->ent.ul += len; ++ rdu->rent++; ++ } else { ++ err = -EFAULT; ++ au_fset_rdu(rdu->cookie.flags, FULL); ++ rdu->full = 1; ++ rdu->tail = arg->ent; ++ } ++ ++out: ++ /* AuTraceErr(err); */ ++ return err; ++} ++ ++static int au_rdu_do(struct file *h_file, struct au_rdu_arg *arg) ++{ ++ int err; ++ loff_t offset; ++ struct au_rdu_cookie *cookie = &arg->rdu->cookie; ++ ++ /* we don't have to care (FMODE_32BITHASH | FMODE_64BITHASH) for ext4 */ ++ offset = vfsub_llseek(h_file, cookie->h_pos, SEEK_SET); ++ err = offset; ++ if (unlikely(offset != cookie->h_pos)) ++ goto out; ++ ++ err = 0; ++ do { ++ arg->err = 0; ++ au_fclr_rdu(cookie->flags, CALLED); ++ /* smp_mb(); */ ++ err = vfsub_iterate_dir(h_file, &arg->ctx); ++ if (err >= 0) ++ err = arg->err; ++ } while (!err ++ && au_ftest_rdu(cookie->flags, CALLED) ++ && !au_ftest_rdu(cookie->flags, FULL)); ++ cookie->h_pos = h_file->f_pos; ++ ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++static int au_rdu(struct file *file, struct aufs_rdu *rdu) ++{ ++ int err; ++ aufs_bindex_t bbot; ++ struct au_rdu_arg arg = { ++ .ctx = { ++ .actor = au_rdu_fill ++ } ++ }; ++ struct dentry *dentry; ++ struct inode *inode; ++ struct file *h_file; ++ struct au_rdu_cookie *cookie = &rdu->cookie; ++ ++ err = !access_ok(VERIFY_WRITE, rdu->ent.e, rdu->sz); ++ if (unlikely(err)) { ++ err = -EFAULT; ++ AuTraceErr(err); ++ goto out; ++ } ++ rdu->rent = 0; ++ rdu->tail = rdu->ent; ++ rdu->full = 0; ++ arg.rdu = rdu; ++ arg.ent = rdu->ent; ++ arg.end = arg.ent.ul; ++ arg.end += rdu->sz; ++ ++ err = -ENOTDIR; ++ if (unlikely(!file->f_op->iterate)) ++ goto out; ++ ++ err = security_file_permission(file, MAY_READ); ++ AuTraceErr(err); ++ if (unlikely(err)) ++ goto out; ++ ++ dentry = file->f_path.dentry; ++ inode = d_inode(dentry); ++#if 1 ++ inode_lock(inode); ++#else ++ /* todo: create a new inline func inode_lock_killable() */ ++ err = mutex_lock_killable(&inode->i_mutex); ++ AuTraceErr(err); ++ if (unlikely(err)) ++ goto out; ++#endif ++ ++ arg.sb = inode->i_sb; ++ err = si_read_lock(arg.sb, AuLock_FLUSH | AuLock_NOPLM); ++ if (unlikely(err)) ++ goto out_mtx; ++ err = au_alive_dir(dentry); ++ if (unlikely(err)) ++ goto out_si; ++ /* todo: reval? */ ++ fi_read_lock(file); ++ ++ err = -EAGAIN; ++ if (unlikely(au_ftest_rdu(cookie->flags, CONT) ++ && cookie->generation != au_figen(file))) ++ goto out_unlock; ++ ++ err = 0; ++ if (!rdu->blk) { ++ rdu->blk = au_sbi(arg.sb)->si_rdblk; ++ if (!rdu->blk) ++ rdu->blk = au_dir_size(file, /*dentry*/NULL); ++ } ++ bbot = au_fbtop(file); ++ if (cookie->bindex < bbot) ++ cookie->bindex = bbot; ++ bbot = au_fbbot_dir(file); ++ /* AuDbg("b%d, b%d\n", cookie->bindex, bbot); */ ++ for (; !err && cookie->bindex <= bbot; ++ cookie->bindex++, cookie->h_pos = 0) { ++ h_file = au_hf_dir(file, cookie->bindex); ++ if (!h_file) ++ continue; ++ ++ au_fclr_rdu(cookie->flags, FULL); ++ err = au_rdu_do(h_file, &arg); ++ AuTraceErr(err); ++ if (unlikely(au_ftest_rdu(cookie->flags, FULL) || err)) ++ break; ++ } ++ AuDbg("rent %llu\n", rdu->rent); ++ ++ if (!err && !au_ftest_rdu(cookie->flags, CONT)) { ++ rdu->shwh = !!au_opt_test(au_sbi(arg.sb)->si_mntflags, SHWH); ++ au_fset_rdu(cookie->flags, CONT); ++ cookie->generation = au_figen(file); ++ } ++ ++ ii_read_lock_child(inode); ++ fsstack_copy_attr_atime(inode, au_h_iptr(inode, au_ibtop(inode))); ++ ii_read_unlock(inode); ++ ++out_unlock: ++ fi_read_unlock(file); ++out_si: ++ si_read_unlock(arg.sb); ++out_mtx: ++ inode_unlock(inode); ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++static int au_rdu_ino(struct file *file, struct aufs_rdu *rdu) ++{ ++ int err; ++ ino_t ino; ++ unsigned long long nent; ++ union au_rdu_ent_ul *u; ++ struct au_rdu_ent ent; ++ struct super_block *sb; ++ ++ err = 0; ++ nent = rdu->nent; ++ u = &rdu->ent; ++ sb = file->f_path.dentry->d_sb; ++ si_read_lock(sb, AuLock_FLUSH); ++ while (nent-- > 0) { ++ /* unnecessary to support mmap_sem since this is a dir */ ++ err = copy_from_user(&ent, u->e, sizeof(ent)); ++ if (!err) ++ err = !access_ok(VERIFY_WRITE, &u->e->ino, sizeof(ino)); ++ if (unlikely(err)) { ++ err = -EFAULT; ++ AuTraceErr(err); ++ break; ++ } ++ ++ /* AuDbg("b%d, i%llu\n", ent.bindex, ent.ino); */ ++ if (!ent.wh) ++ err = au_ino(sb, ent.bindex, ent.ino, ent.type, &ino); ++ else ++ err = au_wh_ino(sb, ent.bindex, ent.ino, ent.type, ++ &ino); ++ if (unlikely(err)) { ++ AuTraceErr(err); ++ break; ++ } ++ ++ err = __put_user(ino, &u->e->ino); ++ if (unlikely(err)) { ++ err = -EFAULT; ++ AuTraceErr(err); ++ break; ++ } ++ u->ul += au_rdu_len(ent.nlen); ++ } ++ si_read_unlock(sb); ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int au_rdu_verify(struct aufs_rdu *rdu) ++{ ++ AuDbg("rdu{%llu, %p, %u | %u | %llu, %u, %u | " ++ "%llu, b%d, 0x%x, g%u}\n", ++ rdu->sz, rdu->ent.e, rdu->verify[AufsCtlRduV_SZ], ++ rdu->blk, ++ rdu->rent, rdu->shwh, rdu->full, ++ rdu->cookie.h_pos, rdu->cookie.bindex, rdu->cookie.flags, ++ rdu->cookie.generation); ++ ++ if (rdu->verify[AufsCtlRduV_SZ] == sizeof(*rdu)) ++ return 0; ++ ++ AuDbg("%u:%u\n", ++ rdu->verify[AufsCtlRduV_SZ], (unsigned int)sizeof(*rdu)); ++ return -EINVAL; ++} ++ ++long au_rdu_ioctl(struct file *file, unsigned int cmd, unsigned long arg) ++{ ++ long err, e; ++ struct aufs_rdu rdu; ++ void __user *p = (void __user *)arg; ++ ++ err = copy_from_user(&rdu, p, sizeof(rdu)); ++ if (unlikely(err)) { ++ err = -EFAULT; ++ AuTraceErr(err); ++ goto out; ++ } ++ err = au_rdu_verify(&rdu); ++ if (unlikely(err)) ++ goto out; ++ ++ switch (cmd) { ++ case AUFS_CTL_RDU: ++ err = au_rdu(file, &rdu); ++ if (unlikely(err)) ++ break; ++ ++ e = copy_to_user(p, &rdu, sizeof(rdu)); ++ if (unlikely(e)) { ++ err = -EFAULT; ++ AuTraceErr(err); ++ } ++ break; ++ case AUFS_CTL_RDU_INO: ++ err = au_rdu_ino(file, &rdu); ++ break; ++ ++ default: ++ /* err = -ENOTTY; */ ++ err = -EINVAL; ++ } ++ ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++#ifdef CONFIG_COMPAT ++long au_rdu_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) ++{ ++ long err, e; ++ struct aufs_rdu rdu; ++ void __user *p = compat_ptr(arg); ++ ++ /* todo: get_user()? */ ++ err = copy_from_user(&rdu, p, sizeof(rdu)); ++ if (unlikely(err)) { ++ err = -EFAULT; ++ AuTraceErr(err); ++ goto out; ++ } ++ rdu.ent.e = compat_ptr(rdu.ent.ul); ++ err = au_rdu_verify(&rdu); ++ if (unlikely(err)) ++ goto out; ++ ++ switch (cmd) { ++ case AUFS_CTL_RDU: ++ err = au_rdu(file, &rdu); ++ if (unlikely(err)) ++ break; ++ ++ rdu.ent.ul = ptr_to_compat(rdu.ent.e); ++ rdu.tail.ul = ptr_to_compat(rdu.tail.e); ++ e = copy_to_user(p, &rdu, sizeof(rdu)); ++ if (unlikely(e)) { ++ err = -EFAULT; ++ AuTraceErr(err); ++ } ++ break; ++ case AUFS_CTL_RDU_INO: ++ err = au_rdu_ino(file, &rdu); ++ break; ++ ++ default: ++ /* err = -ENOTTY; */ ++ err = -EINVAL; ++ } ++ ++out: ++ AuTraceErr(err); ++ return err; ++} ++#endif +diff --git a/fs/aufs/rwsem.h b/fs/aufs/rwsem.h +new file mode 100644 +index 0000000..678fe6f +--- /dev/null ++++ b/fs/aufs/rwsem.h +@@ -0,0 +1,198 @@ ++/* ++ * Copyright (C) 2005-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * simple read-write semaphore wrappers ++ */ ++ ++#ifndef __AUFS_RWSEM_H__ ++#define __AUFS_RWSEM_H__ ++ ++#ifdef __KERNEL__ ++ ++#include "debug.h" ++ ++struct au_rwsem { ++ struct rw_semaphore rwsem; ++#ifdef CONFIG_AUFS_DEBUG ++ /* just for debugging, not almighty counter */ ++ atomic_t rcnt, wcnt; ++#endif ++}; ++ ++#ifdef CONFIG_LOCKDEP ++#define au_lockdep_set_name(rw) \ ++ lockdep_set_class_and_name(&(rw)->rwsem, \ ++ /*original key*/(rw)->rwsem.dep_map.key, \ ++ /*name*/#rw) ++#else ++#define au_lockdep_set_name(rw) do {} while (0) ++#endif ++ ++#ifdef CONFIG_AUFS_DEBUG ++#define AuDbgCntInit(rw) do { \ ++ atomic_set(&(rw)->rcnt, 0); \ ++ atomic_set(&(rw)->wcnt, 0); \ ++ smp_mb(); /* atomic set */ \ ++} while (0) ++ ++#define AuDbgCnt(rw, cnt) atomic_read(&(rw)->cnt) ++#define AuDbgCntInc(rw, cnt) atomic_inc(&(rw)->cnt) ++#define AuDbgCntDec(rw, cnt) WARN_ON(atomic_dec_return(&(rw)->cnt) < 0) ++#define AuDbgRcntInc(rw) AuDbgCntInc(rw, rcnt) ++#define AuDbgRcntDec(rw) AuDbgCntDec(rw, rcnt) ++#define AuDbgWcntInc(rw) AuDbgCntInc(rw, wcnt) ++#define AuDbgWcntDec(rw) AuDbgCntDec(rw, wcnt) ++#else ++#define AuDbgCnt(rw, cnt) 0 ++#define AuDbgCntInit(rw) do {} while (0) ++#define AuDbgRcntInc(rw) do {} while (0) ++#define AuDbgRcntDec(rw) do {} while (0) ++#define AuDbgWcntInc(rw) do {} while (0) ++#define AuDbgWcntDec(rw) do {} while (0) ++#endif /* CONFIG_AUFS_DEBUG */ ++ ++/* to debug easier, do not make them inlined functions */ ++#define AuRwMustNoWaiters(rw) AuDebugOn(rwsem_is_contended(&(rw)->rwsem)) ++/* rwsem_is_locked() is unusable */ ++#define AuRwMustReadLock(rw) AuDebugOn(AuDbgCnt(rw, rcnt) <= 0) ++#define AuRwMustWriteLock(rw) AuDebugOn(AuDbgCnt(rw, wcnt) <= 0) ++#define AuRwMustAnyLock(rw) AuDebugOn(AuDbgCnt(rw, rcnt) <= 0 \ ++ && AuDbgCnt(rw, wcnt) <= 0) ++#define AuRwDestroy(rw) AuDebugOn(AuDbgCnt(rw, rcnt) \ ++ || AuDbgCnt(rw, wcnt)) ++ ++#define au_rw_init(rw) do { \ ++ AuDbgCntInit(rw); \ ++ init_rwsem(&(rw)->rwsem); \ ++ au_lockdep_set_name(rw); \ ++ } while (0) ++ ++#define au_rw_init_wlock(rw) do { \ ++ au_rw_init(rw); \ ++ down_write(&(rw)->rwsem); \ ++ AuDbgWcntInc(rw); \ ++ } while (0) ++ ++#define au_rw_init_wlock_nested(rw, lsc) do { \ ++ au_rw_init(rw); \ ++ down_write_nested(&(rw)->rwsem, lsc); \ ++ AuDbgWcntInc(rw); \ ++ } while (0) ++ ++static inline void au_rw_read_lock(struct au_rwsem *rw) ++{ ++ down_read(&rw->rwsem); ++ AuDbgRcntInc(rw); ++} ++ ++static inline void au_rw_read_lock_nested(struct au_rwsem *rw, unsigned int lsc) ++{ ++ down_read_nested(&rw->rwsem, lsc); ++ AuDbgRcntInc(rw); ++} ++ ++static inline void au_rw_read_unlock(struct au_rwsem *rw) ++{ ++ AuRwMustReadLock(rw); ++ AuDbgRcntDec(rw); ++ up_read(&rw->rwsem); ++} ++ ++static inline void au_rw_dgrade_lock(struct au_rwsem *rw) ++{ ++ AuRwMustWriteLock(rw); ++ AuDbgRcntInc(rw); ++ AuDbgWcntDec(rw); ++ downgrade_write(&rw->rwsem); ++} ++ ++static inline void au_rw_write_lock(struct au_rwsem *rw) ++{ ++ down_write(&rw->rwsem); ++ AuDbgWcntInc(rw); ++} ++ ++static inline void au_rw_write_lock_nested(struct au_rwsem *rw, ++ unsigned int lsc) ++{ ++ down_write_nested(&rw->rwsem, lsc); ++ AuDbgWcntInc(rw); ++} ++ ++static inline void au_rw_write_unlock(struct au_rwsem *rw) ++{ ++ AuRwMustWriteLock(rw); ++ AuDbgWcntDec(rw); ++ up_write(&rw->rwsem); ++} ++ ++/* why is not _nested version defined */ ++static inline int au_rw_read_trylock(struct au_rwsem *rw) ++{ ++ int ret; ++ ++ ret = down_read_trylock(&rw->rwsem); ++ if (ret) ++ AuDbgRcntInc(rw); ++ return ret; ++} ++ ++static inline int au_rw_write_trylock(struct au_rwsem *rw) ++{ ++ int ret; ++ ++ ret = down_write_trylock(&rw->rwsem); ++ if (ret) ++ AuDbgWcntInc(rw); ++ return ret; ++} ++ ++#undef AuDbgCntDec ++#undef AuDbgRcntInc ++#undef AuDbgRcntDec ++#undef AuDbgWcntDec ++ ++#define AuSimpleLockRwsemFuncs(prefix, param, rwsem) \ ++static inline void prefix##_read_lock(param) \ ++{ au_rw_read_lock(rwsem); } \ ++static inline void prefix##_write_lock(param) \ ++{ au_rw_write_lock(rwsem); } \ ++static inline int prefix##_read_trylock(param) \ ++{ return au_rw_read_trylock(rwsem); } \ ++static inline int prefix##_write_trylock(param) \ ++{ return au_rw_write_trylock(rwsem); } ++/* why is not _nested version defined */ ++/* static inline void prefix##_read_trylock_nested(param, lsc) ++{ au_rw_read_trylock_nested(rwsem, lsc)); } ++static inline void prefix##_write_trylock_nestd(param, lsc) ++{ au_rw_write_trylock_nested(rwsem, lsc); } */ ++ ++#define AuSimpleUnlockRwsemFuncs(prefix, param, rwsem) \ ++static inline void prefix##_read_unlock(param) \ ++{ au_rw_read_unlock(rwsem); } \ ++static inline void prefix##_write_unlock(param) \ ++{ au_rw_write_unlock(rwsem); } \ ++static inline void prefix##_downgrade_lock(param) \ ++{ au_rw_dgrade_lock(rwsem); } ++ ++#define AuSimpleRwsemFuncs(prefix, param, rwsem) \ ++ AuSimpleLockRwsemFuncs(prefix, param, rwsem) \ ++ AuSimpleUnlockRwsemFuncs(prefix, param, rwsem) ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_RWSEM_H__ */ +diff --git a/fs/aufs/sbinfo.c b/fs/aufs/sbinfo.c +new file mode 100644 +index 0000000..3dd9fd6 +--- /dev/null ++++ b/fs/aufs/sbinfo.c +@@ -0,0 +1,353 @@ ++/* ++ * Copyright (C) 2005-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * superblock private data ++ */ ++ ++#include "aufs.h" ++ ++/* ++ * they are necessary regardless sysfs is disabled. ++ */ ++void au_si_free(struct kobject *kobj) ++{ ++ int i; ++ struct au_sbinfo *sbinfo; ++ char *locked __maybe_unused; /* debug only */ ++ ++ sbinfo = container_of(kobj, struct au_sbinfo, si_kobj); ++ for (i = 0; i < AuPlink_NHASH; i++) ++ AuDebugOn(!hlist_empty(&sbinfo->si_plink[i].head)); ++ au_nwt_fin(&sbinfo->si_nowait); ++ ++ AuDebugOn(percpu_counter_sum(&sbinfo->si_ninodes)); ++ percpu_counter_destroy(&sbinfo->si_ninodes); ++ AuDebugOn(percpu_counter_sum(&sbinfo->si_nfiles)); ++ percpu_counter_destroy(&sbinfo->si_nfiles); ++ ++ au_rw_write_lock(&sbinfo->si_rwsem); ++ au_br_free(sbinfo); ++ au_rw_write_unlock(&sbinfo->si_rwsem); ++ ++ kfree(sbinfo->si_branch); ++ for (i = 0; i < AU_NPIDMAP; i++) ++ kfree(sbinfo->au_si_pid.pid_bitmap[i]); ++ mutex_destroy(&sbinfo->au_si_pid.pid_mtx); ++ mutex_destroy(&sbinfo->si_xib_mtx); ++ AuRwDestroy(&sbinfo->si_rwsem); ++ ++ kfree(sbinfo); ++} ++ ++int au_si_alloc(struct super_block *sb) ++{ ++ int err, i; ++ struct au_sbinfo *sbinfo; ++ ++ err = -ENOMEM; ++ sbinfo = kzalloc(sizeof(*sbinfo), GFP_NOFS); ++ if (unlikely(!sbinfo)) ++ goto out; ++ ++ /* will be reallocated separately */ ++ sbinfo->si_branch = kzalloc(sizeof(*sbinfo->si_branch), GFP_NOFS); ++ if (unlikely(!sbinfo->si_branch)) ++ goto out_sbinfo; ++ ++ err = sysaufs_si_init(sbinfo); ++ if (unlikely(err)) ++ goto out_br; ++ ++ au_nwt_init(&sbinfo->si_nowait); ++ au_rw_init_wlock(&sbinfo->si_rwsem); ++ mutex_init(&sbinfo->au_si_pid.pid_mtx); ++ ++ percpu_counter_init(&sbinfo->si_ninodes, 0, GFP_NOFS); ++ percpu_counter_init(&sbinfo->si_nfiles, 0, GFP_NOFS); ++ ++ sbinfo->si_bbot = -1; ++ sbinfo->si_last_br_id = AUFS_BRANCH_MAX / 2; ++ ++ sbinfo->si_wbr_copyup = AuWbrCopyup_Def; ++ sbinfo->si_wbr_create = AuWbrCreate_Def; ++ sbinfo->si_wbr_copyup_ops = au_wbr_copyup_ops + sbinfo->si_wbr_copyup; ++ sbinfo->si_wbr_create_ops = au_wbr_create_ops + sbinfo->si_wbr_create; ++ ++ au_fhsm_init(sbinfo); ++ ++ sbinfo->si_mntflags = au_opts_plink(AuOpt_Def); ++ ++ sbinfo->si_xino_jiffy = jiffies; ++ sbinfo->si_xino_expire ++ = msecs_to_jiffies(AUFS_XINO_DEF_SEC * MSEC_PER_SEC); ++ mutex_init(&sbinfo->si_xib_mtx); ++ sbinfo->si_xino_brid = -1; ++ /* leave si_xib_last_pindex and si_xib_next_bit */ ++ ++ au_sphl_init(&sbinfo->si_aopen); ++ ++ sbinfo->si_rdcache = msecs_to_jiffies(AUFS_RDCACHE_DEF * MSEC_PER_SEC); ++ sbinfo->si_rdblk = AUFS_RDBLK_DEF; ++ sbinfo->si_rdhash = AUFS_RDHASH_DEF; ++ sbinfo->si_dirwh = AUFS_DIRWH_DEF; ++ ++ for (i = 0; i < AuPlink_NHASH; i++) ++ au_sphl_init(sbinfo->si_plink + i); ++ init_waitqueue_head(&sbinfo->si_plink_wq); ++ spin_lock_init(&sbinfo->si_plink_maint_lock); ++ ++ au_sphl_init(&sbinfo->si_files); ++ ++ /* with getattr by default */ ++ sbinfo->si_iop_array = aufs_iop; ++ ++ /* leave other members for sysaufs and si_mnt. */ ++ sbinfo->si_sb = sb; ++ sb->s_fs_info = sbinfo; ++ si_pid_set(sb); ++ return 0; /* success */ ++ ++out_br: ++ kfree(sbinfo->si_branch); ++out_sbinfo: ++ kfree(sbinfo); ++out: ++ return err; ++} ++ ++int au_sbr_realloc(struct au_sbinfo *sbinfo, int nbr) ++{ ++ int err, sz; ++ struct au_branch **brp; ++ ++ AuRwMustWriteLock(&sbinfo->si_rwsem); ++ ++ err = -ENOMEM; ++ sz = sizeof(*brp) * (sbinfo->si_bbot + 1); ++ if (unlikely(!sz)) ++ sz = sizeof(*brp); ++ brp = au_kzrealloc(sbinfo->si_branch, sz, sizeof(*brp) * nbr, GFP_NOFS); ++ if (brp) { ++ sbinfo->si_branch = brp; ++ err = 0; ++ } ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++unsigned int au_sigen_inc(struct super_block *sb) ++{ ++ unsigned int gen; ++ struct inode *inode; ++ ++ SiMustWriteLock(sb); ++ ++ gen = ++au_sbi(sb)->si_generation; ++ au_update_digen(sb->s_root); ++ inode = d_inode(sb->s_root); ++ au_update_iigen(inode, /*half*/0); ++ inode->i_version++; ++ return gen; ++} ++ ++aufs_bindex_t au_new_br_id(struct super_block *sb) ++{ ++ aufs_bindex_t br_id; ++ int i; ++ struct au_sbinfo *sbinfo; ++ ++ SiMustWriteLock(sb); ++ ++ sbinfo = au_sbi(sb); ++ for (i = 0; i <= AUFS_BRANCH_MAX; i++) { ++ br_id = ++sbinfo->si_last_br_id; ++ AuDebugOn(br_id < 0); ++ if (br_id && au_br_index(sb, br_id) < 0) ++ return br_id; ++ } ++ ++ return -1; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* it is ok that new 'nwt' tasks are appended while we are sleeping */ ++int si_read_lock(struct super_block *sb, int flags) ++{ ++ int err; ++ ++ err = 0; ++ if (au_ftest_lock(flags, FLUSH)) ++ au_nwt_flush(&au_sbi(sb)->si_nowait); ++ ++ si_noflush_read_lock(sb); ++ err = au_plink_maint(sb, flags); ++ if (unlikely(err)) ++ si_read_unlock(sb); ++ ++ return err; ++} ++ ++int si_write_lock(struct super_block *sb, int flags) ++{ ++ int err; ++ ++ if (au_ftest_lock(flags, FLUSH)) ++ au_nwt_flush(&au_sbi(sb)->si_nowait); ++ ++ si_noflush_write_lock(sb); ++ err = au_plink_maint(sb, flags); ++ if (unlikely(err)) ++ si_write_unlock(sb); ++ ++ return err; ++} ++ ++/* dentry and super_block lock. call at entry point */ ++int aufs_read_lock(struct dentry *dentry, int flags) ++{ ++ int err; ++ struct super_block *sb; ++ ++ sb = dentry->d_sb; ++ err = si_read_lock(sb, flags); ++ if (unlikely(err)) ++ goto out; ++ ++ if (au_ftest_lock(flags, DW)) ++ di_write_lock_child(dentry); ++ else ++ di_read_lock_child(dentry, flags); ++ ++ if (au_ftest_lock(flags, GEN)) { ++ err = au_digen_test(dentry, au_sigen(sb)); ++ if (!au_opt_test(au_mntflags(sb), UDBA_NONE)) ++ AuDebugOn(!err && au_dbrange_test(dentry)); ++ else if (!err) ++ err = au_dbrange_test(dentry); ++ if (unlikely(err)) ++ aufs_read_unlock(dentry, flags); ++ } ++ ++out: ++ return err; ++} ++ ++void aufs_read_unlock(struct dentry *dentry, int flags) ++{ ++ if (au_ftest_lock(flags, DW)) ++ di_write_unlock(dentry); ++ else ++ di_read_unlock(dentry, flags); ++ si_read_unlock(dentry->d_sb); ++} ++ ++void aufs_write_lock(struct dentry *dentry) ++{ ++ si_write_lock(dentry->d_sb, AuLock_FLUSH | AuLock_NOPLMW); ++ di_write_lock_child(dentry); ++} ++ ++void aufs_write_unlock(struct dentry *dentry) ++{ ++ di_write_unlock(dentry); ++ si_write_unlock(dentry->d_sb); ++} ++ ++int aufs_read_and_write_lock2(struct dentry *d1, struct dentry *d2, int flags) ++{ ++ int err; ++ unsigned int sigen; ++ struct super_block *sb; ++ ++ sb = d1->d_sb; ++ err = si_read_lock(sb, flags); ++ if (unlikely(err)) ++ goto out; ++ ++ di_write_lock2_child(d1, d2, au_ftest_lock(flags, DIRS)); ++ ++ if (au_ftest_lock(flags, GEN)) { ++ sigen = au_sigen(sb); ++ err = au_digen_test(d1, sigen); ++ AuDebugOn(!err && au_dbrange_test(d1)); ++ if (!err) { ++ err = au_digen_test(d2, sigen); ++ AuDebugOn(!err && au_dbrange_test(d2)); ++ } ++ if (unlikely(err)) ++ aufs_read_and_write_unlock2(d1, d2); ++ } ++ ++out: ++ return err; ++} ++ ++void aufs_read_and_write_unlock2(struct dentry *d1, struct dentry *d2) ++{ ++ di_write_unlock2(d1, d2); ++ si_read_unlock(d1->d_sb); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static void si_pid_alloc(struct au_si_pid *au_si_pid, int idx) ++{ ++ unsigned long *p; ++ ++ BUILD_BUG_ON(sizeof(unsigned long) != ++ sizeof(*au_si_pid->pid_bitmap)); ++ ++ mutex_lock(&au_si_pid->pid_mtx); ++ p = au_si_pid->pid_bitmap[idx]; ++ while (!p) { ++ /* ++ * bad approach. ++ * but keeping 'si_pid_set()' void is more important. ++ */ ++ p = kcalloc(BITS_TO_LONGS(AU_PIDSTEP), ++ sizeof(*au_si_pid->pid_bitmap), ++ GFP_NOFS); ++ if (p) ++ break; ++ cond_resched(); ++ } ++ au_si_pid->pid_bitmap[idx] = p; ++ mutex_unlock(&au_si_pid->pid_mtx); ++} ++ ++void si_pid_set(struct super_block *sb) ++{ ++ pid_t bit; ++ int idx; ++ unsigned long *bitmap; ++ struct au_si_pid *au_si_pid; ++ ++ si_pid_idx_bit(&idx, &bit); ++ au_si_pid = &au_sbi(sb)->au_si_pid; ++ bitmap = au_si_pid->pid_bitmap[idx]; ++ if (!bitmap) { ++ si_pid_alloc(au_si_pid, idx); ++ bitmap = au_si_pid->pid_bitmap[idx]; ++ } ++ AuDebugOn(test_bit(bit, bitmap)); ++ set_bit(bit, bitmap); ++ /* smp_mb(); */ ++} +diff --git a/fs/aufs/spl.h b/fs/aufs/spl.h +new file mode 100644 +index 0000000..945343a +--- /dev/null ++++ b/fs/aufs/spl.h +@@ -0,0 +1,111 @@ ++/* ++ * Copyright (C) 2005-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * simple list protected by a spinlock ++ */ ++ ++#ifndef __AUFS_SPL_H__ ++#define __AUFS_SPL_H__ ++ ++#ifdef __KERNEL__ ++ ++struct au_splhead { ++ spinlock_t spin; ++ struct list_head head; ++}; ++ ++static inline void au_spl_init(struct au_splhead *spl) ++{ ++ spin_lock_init(&spl->spin); ++ INIT_LIST_HEAD(&spl->head); ++} ++ ++static inline void au_spl_add(struct list_head *list, struct au_splhead *spl) ++{ ++ spin_lock(&spl->spin); ++ list_add(list, &spl->head); ++ spin_unlock(&spl->spin); ++} ++ ++static inline void au_spl_del(struct list_head *list, struct au_splhead *spl) ++{ ++ spin_lock(&spl->spin); ++ list_del(list); ++ spin_unlock(&spl->spin); ++} ++ ++static inline void au_spl_del_rcu(struct list_head *list, ++ struct au_splhead *spl) ++{ ++ spin_lock(&spl->spin); ++ list_del_rcu(list); ++ spin_unlock(&spl->spin); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct au_sphlhead { ++ spinlock_t spin; ++ struct hlist_head head; ++}; ++ ++static inline void au_sphl_init(struct au_sphlhead *sphl) ++{ ++ spin_lock_init(&sphl->spin); ++ INIT_HLIST_HEAD(&sphl->head); ++} ++ ++static inline void au_sphl_add(struct hlist_node *hlist, ++ struct au_sphlhead *sphl) ++{ ++ spin_lock(&sphl->spin); ++ hlist_add_head(hlist, &sphl->head); ++ spin_unlock(&sphl->spin); ++} ++ ++static inline void au_sphl_del(struct hlist_node *hlist, ++ struct au_sphlhead *sphl) ++{ ++ spin_lock(&sphl->spin); ++ hlist_del(hlist); ++ spin_unlock(&sphl->spin); ++} ++ ++static inline void au_sphl_del_rcu(struct hlist_node *hlist, ++ struct au_sphlhead *sphl) ++{ ++ spin_lock(&sphl->spin); ++ hlist_del_rcu(hlist); ++ spin_unlock(&sphl->spin); ++} ++ ++static inline unsigned long au_sphl_count(struct au_sphlhead *sphl) ++{ ++ unsigned long cnt; ++ struct hlist_node *pos; ++ ++ cnt = 0; ++ spin_lock(&sphl->spin); ++ hlist_for_each(pos, &sphl->head) ++ cnt++; ++ spin_unlock(&sphl->spin); ++ return cnt; ++} ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_SPL_H__ */ +diff --git a/fs/aufs/super.c b/fs/aufs/super.c +new file mode 100644 +index 0000000..ff7e582 +--- /dev/null ++++ b/fs/aufs/super.c +@@ -0,0 +1,1039 @@ ++/* ++ * Copyright (C) 2005-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * mount and super_block operations ++ */ ++ ++#include ++#include ++#include ++#include ++#include "aufs.h" ++ ++/* ++ * super_operations ++ */ ++static struct inode *aufs_alloc_inode(struct super_block *sb __maybe_unused) ++{ ++ struct au_icntnr *c; ++ ++ c = au_cache_alloc_icntnr(); ++ if (c) { ++ au_icntnr_init(c); ++ c->vfs_inode.i_version = 1; /* sigen(sb); */ ++ return &c->vfs_inode; ++ } ++ return NULL; ++} ++ ++static void aufs_destroy_inode_cb(struct rcu_head *head) ++{ ++ struct inode *inode = container_of(head, struct inode, i_rcu); ++ ++ INIT_HLIST_HEAD(&inode->i_dentry); ++ au_cache_free_icntnr(container_of(inode, struct au_icntnr, vfs_inode)); ++} ++ ++static void aufs_destroy_inode(struct inode *inode) ++{ ++ if (!is_bad_inode(inode)) ++ au_iinfo_fin(inode); ++ call_rcu(&inode->i_rcu, aufs_destroy_inode_cb); ++} ++ ++struct inode *au_iget_locked(struct super_block *sb, ino_t ino) ++{ ++ struct inode *inode; ++ int err; ++ ++ inode = iget_locked(sb, ino); ++ if (unlikely(!inode)) { ++ inode = ERR_PTR(-ENOMEM); ++ goto out; ++ } ++ if (!(inode->i_state & I_NEW)) ++ goto out; ++ ++ err = au_xigen_new(inode); ++ if (!err) ++ err = au_iinfo_init(inode); ++ if (!err) ++ inode->i_version++; ++ else { ++ iget_failed(inode); ++ inode = ERR_PTR(err); ++ } ++ ++out: ++ /* never return NULL */ ++ AuDebugOn(!inode); ++ AuTraceErrPtr(inode); ++ return inode; ++} ++ ++/* lock free root dinfo */ ++static int au_show_brs(struct seq_file *seq, struct super_block *sb) ++{ ++ int err; ++ aufs_bindex_t bindex, bbot; ++ struct path path; ++ struct au_hdentry *hdp; ++ struct au_branch *br; ++ au_br_perm_str_t perm; ++ ++ err = 0; ++ bbot = au_sbbot(sb); ++ hdp = au_di(sb->s_root)->di_hdentry; ++ for (bindex = 0; !err && bindex <= bbot; bindex++) { ++ br = au_sbr(sb, bindex); ++ path.mnt = au_br_mnt(br); ++ path.dentry = hdp[bindex].hd_dentry; ++ err = au_seq_path(seq, &path); ++ if (!err) { ++ au_optstr_br_perm(&perm, br->br_perm); ++ seq_printf(seq, "=%s", perm.a); ++ if (bindex != bbot) ++ seq_putc(seq, ':'); ++ } ++ } ++ if (unlikely(err || seq_has_overflowed(seq))) ++ err = -E2BIG; ++ ++ return err; ++} ++ ++static void au_show_wbr_create(struct seq_file *m, int v, ++ struct au_sbinfo *sbinfo) ++{ ++ const char *pat; ++ ++ AuRwMustAnyLock(&sbinfo->si_rwsem); ++ ++ seq_puts(m, ",create="); ++ pat = au_optstr_wbr_create(v); ++ switch (v) { ++ case AuWbrCreate_TDP: ++ case AuWbrCreate_RR: ++ case AuWbrCreate_MFS: ++ case AuWbrCreate_PMFS: ++ seq_puts(m, pat); ++ break; ++ case AuWbrCreate_MFSV: ++ seq_printf(m, /*pat*/"mfs:%lu", ++ jiffies_to_msecs(sbinfo->si_wbr_mfs.mfs_expire) ++ / MSEC_PER_SEC); ++ break; ++ case AuWbrCreate_PMFSV: ++ seq_printf(m, /*pat*/"pmfs:%lu", ++ jiffies_to_msecs(sbinfo->si_wbr_mfs.mfs_expire) ++ / MSEC_PER_SEC); ++ break; ++ case AuWbrCreate_MFSRR: ++ seq_printf(m, /*pat*/"mfsrr:%llu", ++ sbinfo->si_wbr_mfs.mfsrr_watermark); ++ break; ++ case AuWbrCreate_MFSRRV: ++ seq_printf(m, /*pat*/"mfsrr:%llu:%lu", ++ sbinfo->si_wbr_mfs.mfsrr_watermark, ++ jiffies_to_msecs(sbinfo->si_wbr_mfs.mfs_expire) ++ / MSEC_PER_SEC); ++ break; ++ case AuWbrCreate_PMFSRR: ++ seq_printf(m, /*pat*/"pmfsrr:%llu", ++ sbinfo->si_wbr_mfs.mfsrr_watermark); ++ break; ++ case AuWbrCreate_PMFSRRV: ++ seq_printf(m, /*pat*/"pmfsrr:%llu:%lu", ++ sbinfo->si_wbr_mfs.mfsrr_watermark, ++ jiffies_to_msecs(sbinfo->si_wbr_mfs.mfs_expire) ++ / MSEC_PER_SEC); ++ break; ++ } ++} ++ ++static int au_show_xino(struct seq_file *seq, struct super_block *sb) ++{ ++#ifdef CONFIG_SYSFS ++ return 0; ++#else ++ int err; ++ const int len = sizeof(AUFS_XINO_FNAME) - 1; ++ aufs_bindex_t bindex, brid; ++ struct qstr *name; ++ struct file *f; ++ struct dentry *d, *h_root; ++ struct au_hdentry *hdp; ++ ++ AuRwMustAnyLock(&sbinfo->si_rwsem); ++ ++ err = 0; ++ f = au_sbi(sb)->si_xib; ++ if (!f) ++ goto out; ++ ++ /* stop printing the default xino path on the first writable branch */ ++ h_root = NULL; ++ brid = au_xino_brid(sb); ++ if (brid >= 0) { ++ bindex = au_br_index(sb, brid); ++ hdp = au_di(sb->s_root)->di_hdentry; ++ h_root = hdp[0 + bindex].hd_dentry; ++ } ++ d = f->f_path.dentry; ++ name = &d->d_name; ++ /* safe ->d_parent because the file is unlinked */ ++ if (d->d_parent == h_root ++ && name->len == len ++ && !memcmp(name->name, AUFS_XINO_FNAME, len)) ++ goto out; ++ ++ seq_puts(seq, ",xino="); ++ err = au_xino_path(seq, f); ++ ++out: ++ return err; ++#endif ++} ++ ++/* seq_file will re-call me in case of too long string */ ++static int aufs_show_options(struct seq_file *m, struct dentry *dentry) ++{ ++ int err; ++ unsigned int mnt_flags, v; ++ struct super_block *sb; ++ struct au_sbinfo *sbinfo; ++ ++#define AuBool(name, str) do { \ ++ v = au_opt_test(mnt_flags, name); \ ++ if (v != au_opt_test(AuOpt_Def, name)) \ ++ seq_printf(m, ",%s" #str, v ? "" : "no"); \ ++} while (0) ++ ++#define AuStr(name, str) do { \ ++ v = mnt_flags & AuOptMask_##name; \ ++ if (v != (AuOpt_Def & AuOptMask_##name)) \ ++ seq_printf(m, "," #str "=%s", au_optstr_##str(v)); \ ++} while (0) ++ ++#define AuUInt(name, str, val) do { \ ++ if (val != AUFS_##name##_DEF) \ ++ seq_printf(m, "," #str "=%u", val); \ ++} while (0) ++ ++ sb = dentry->d_sb; ++ if (sb->s_flags & MS_POSIXACL) ++ seq_puts(m, ",acl"); ++ ++ /* lock free root dinfo */ ++ si_noflush_read_lock(sb); ++ sbinfo = au_sbi(sb); ++ seq_printf(m, ",si=%lx", sysaufs_si_id(sbinfo)); ++ ++ mnt_flags = au_mntflags(sb); ++ if (au_opt_test(mnt_flags, XINO)) { ++ err = au_show_xino(m, sb); ++ if (unlikely(err)) ++ goto out; ++ } else ++ seq_puts(m, ",noxino"); ++ ++ AuBool(TRUNC_XINO, trunc_xino); ++ AuStr(UDBA, udba); ++ AuBool(SHWH, shwh); ++ AuBool(PLINK, plink); ++ AuBool(DIO, dio); ++ AuBool(DIRPERM1, dirperm1); ++ ++ v = sbinfo->si_wbr_create; ++ if (v != AuWbrCreate_Def) ++ au_show_wbr_create(m, v, sbinfo); ++ ++ v = sbinfo->si_wbr_copyup; ++ if (v != AuWbrCopyup_Def) ++ seq_printf(m, ",cpup=%s", au_optstr_wbr_copyup(v)); ++ ++ v = au_opt_test(mnt_flags, ALWAYS_DIROPQ); ++ if (v != au_opt_test(AuOpt_Def, ALWAYS_DIROPQ)) ++ seq_printf(m, ",diropq=%c", v ? 'a' : 'w'); ++ ++ AuUInt(DIRWH, dirwh, sbinfo->si_dirwh); ++ ++ v = jiffies_to_msecs(sbinfo->si_rdcache) / MSEC_PER_SEC; ++ AuUInt(RDCACHE, rdcache, v); ++ ++ AuUInt(RDBLK, rdblk, sbinfo->si_rdblk); ++ AuUInt(RDHASH, rdhash, sbinfo->si_rdhash); ++ ++ au_fhsm_show(m, sbinfo); ++ ++ AuBool(SUM, sum); ++ /* AuBool(SUM_W, wsum); */ ++ AuBool(WARN_PERM, warn_perm); ++ AuBool(VERBOSE, verbose); ++ ++out: ++ /* be sure to print "br:" last */ ++ if (!sysaufs_brs) { ++ seq_puts(m, ",br:"); ++ au_show_brs(m, sb); ++ } ++ si_read_unlock(sb); ++ return 0; ++ ++#undef AuBool ++#undef AuStr ++#undef AuUInt ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* sum mode which returns the summation for statfs(2) */ ++ ++static u64 au_add_till_max(u64 a, u64 b) ++{ ++ u64 old; ++ ++ old = a; ++ a += b; ++ if (old <= a) ++ return a; ++ return ULLONG_MAX; ++} ++ ++static u64 au_mul_till_max(u64 a, long mul) ++{ ++ u64 old; ++ ++ old = a; ++ a *= mul; ++ if (old <= a) ++ return a; ++ return ULLONG_MAX; ++} ++ ++static int au_statfs_sum(struct super_block *sb, struct kstatfs *buf) ++{ ++ int err; ++ long bsize, factor; ++ u64 blocks, bfree, bavail, files, ffree; ++ aufs_bindex_t bbot, bindex, i; ++ unsigned char shared; ++ struct path h_path; ++ struct super_block *h_sb; ++ ++ err = 0; ++ bsize = LONG_MAX; ++ files = 0; ++ ffree = 0; ++ blocks = 0; ++ bfree = 0; ++ bavail = 0; ++ bbot = au_sbbot(sb); ++ for (bindex = 0; bindex <= bbot; bindex++) { ++ h_path.mnt = au_sbr_mnt(sb, bindex); ++ h_sb = h_path.mnt->mnt_sb; ++ shared = 0; ++ for (i = 0; !shared && i < bindex; i++) ++ shared = (au_sbr_sb(sb, i) == h_sb); ++ if (shared) ++ continue; ++ ++ /* sb->s_root for NFS is unreliable */ ++ h_path.dentry = h_path.mnt->mnt_root; ++ err = vfs_statfs(&h_path, buf); ++ if (unlikely(err)) ++ goto out; ++ ++ if (bsize > buf->f_bsize) { ++ /* ++ * we will reduce bsize, so we have to expand blocks ++ * etc. to match them again ++ */ ++ factor = (bsize / buf->f_bsize); ++ blocks = au_mul_till_max(blocks, factor); ++ bfree = au_mul_till_max(bfree, factor); ++ bavail = au_mul_till_max(bavail, factor); ++ bsize = buf->f_bsize; ++ } ++ ++ factor = (buf->f_bsize / bsize); ++ blocks = au_add_till_max(blocks, ++ au_mul_till_max(buf->f_blocks, factor)); ++ bfree = au_add_till_max(bfree, ++ au_mul_till_max(buf->f_bfree, factor)); ++ bavail = au_add_till_max(bavail, ++ au_mul_till_max(buf->f_bavail, factor)); ++ files = au_add_till_max(files, buf->f_files); ++ ffree = au_add_till_max(ffree, buf->f_ffree); ++ } ++ ++ buf->f_bsize = bsize; ++ buf->f_blocks = blocks; ++ buf->f_bfree = bfree; ++ buf->f_bavail = bavail; ++ buf->f_files = files; ++ buf->f_ffree = ffree; ++ buf->f_frsize = 0; ++ ++out: ++ return err; ++} ++ ++static int aufs_statfs(struct dentry *dentry, struct kstatfs *buf) ++{ ++ int err; ++ struct path h_path; ++ struct super_block *sb; ++ ++ /* lock free root dinfo */ ++ sb = dentry->d_sb; ++ si_noflush_read_lock(sb); ++ if (!au_opt_test(au_mntflags(sb), SUM)) { ++ /* sb->s_root for NFS is unreliable */ ++ h_path.mnt = au_sbr_mnt(sb, 0); ++ h_path.dentry = h_path.mnt->mnt_root; ++ err = vfs_statfs(&h_path, buf); ++ } else ++ err = au_statfs_sum(sb, buf); ++ si_read_unlock(sb); ++ ++ if (!err) { ++ buf->f_type = AUFS_SUPER_MAGIC; ++ buf->f_namelen = AUFS_MAX_NAMELEN; ++ memset(&buf->f_fsid, 0, sizeof(buf->f_fsid)); ++ } ++ /* buf->f_bsize = buf->f_blocks = buf->f_bfree = buf->f_bavail = -1; */ ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int aufs_sync_fs(struct super_block *sb, int wait) ++{ ++ int err, e; ++ aufs_bindex_t bbot, bindex; ++ struct au_branch *br; ++ struct super_block *h_sb; ++ ++ err = 0; ++ si_noflush_read_lock(sb); ++ bbot = au_sbbot(sb); ++ for (bindex = 0; bindex <= bbot; bindex++) { ++ br = au_sbr(sb, bindex); ++ if (!au_br_writable(br->br_perm)) ++ continue; ++ ++ h_sb = au_sbr_sb(sb, bindex); ++ if (h_sb->s_op->sync_fs) { ++ e = h_sb->s_op->sync_fs(h_sb, wait); ++ if (unlikely(e && !err)) ++ err = e; ++ /* go on even if an error happens */ ++ } ++ } ++ si_read_unlock(sb); ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* final actions when unmounting a file system */ ++static void aufs_put_super(struct super_block *sb) ++{ ++ struct au_sbinfo *sbinfo; ++ ++ sbinfo = au_sbi(sb); ++ if (!sbinfo) ++ return; ++ ++ dbgaufs_si_fin(sbinfo); ++ kobject_put(&sbinfo->si_kobj); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++void *au_array_alloc(unsigned long long *hint, au_arraycb_t cb, ++ struct super_block *sb, void *arg) ++{ ++ void *array; ++ unsigned long long n, sz; ++ ++ array = NULL; ++ n = 0; ++ if (!*hint) ++ goto out; ++ ++ if (*hint > ULLONG_MAX / sizeof(array)) { ++ array = ERR_PTR(-EMFILE); ++ pr_err("hint %llu\n", *hint); ++ goto out; ++ } ++ ++ sz = sizeof(array) * *hint; ++ array = kzalloc(sz, GFP_NOFS); ++ if (unlikely(!array)) ++ array = vzalloc(sz); ++ if (unlikely(!array)) { ++ array = ERR_PTR(-ENOMEM); ++ goto out; ++ } ++ ++ n = cb(sb, array, *hint, arg); ++ AuDebugOn(n > *hint); ++ ++out: ++ *hint = n; ++ return array; ++} ++ ++static unsigned long long au_iarray_cb(struct super_block *sb, void *a, ++ unsigned long long max __maybe_unused, ++ void *arg) ++{ ++ unsigned long long n; ++ struct inode **p, *inode; ++ struct list_head *head; ++ ++ n = 0; ++ p = a; ++ head = arg; ++ spin_lock(&sb->s_inode_list_lock); ++ list_for_each_entry(inode, head, i_sb_list) { ++ if (!is_bad_inode(inode) ++ && au_ii(inode)->ii_btop >= 0) { ++ spin_lock(&inode->i_lock); ++ if (atomic_read(&inode->i_count)) { ++ au_igrab(inode); ++ *p++ = inode; ++ n++; ++ AuDebugOn(n > max); ++ } ++ spin_unlock(&inode->i_lock); ++ } ++ } ++ spin_unlock(&sb->s_inode_list_lock); ++ ++ return n; ++} ++ ++struct inode **au_iarray_alloc(struct super_block *sb, unsigned long long *max) ++{ ++ *max = au_ninodes(sb); ++ return au_array_alloc(max, au_iarray_cb, sb, &sb->s_inodes); ++} ++ ++void au_iarray_free(struct inode **a, unsigned long long max) ++{ ++ unsigned long long ull; ++ ++ for (ull = 0; ull < max; ull++) ++ iput(a[ull]); ++ kvfree(a); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * refresh dentry and inode at remount time. ++ */ ++/* todo: consolidate with simple_reval_dpath() and au_reval_for_attr() */ ++static int au_do_refresh(struct dentry *dentry, unsigned int dir_flags, ++ struct dentry *parent) ++{ ++ int err; ++ ++ di_write_lock_child(dentry); ++ di_read_lock_parent(parent, AuLock_IR); ++ err = au_refresh_dentry(dentry, parent); ++ if (!err && dir_flags) ++ au_hn_reset(d_inode(dentry), dir_flags); ++ di_read_unlock(parent, AuLock_IR); ++ di_write_unlock(dentry); ++ ++ return err; ++} ++ ++static int au_do_refresh_d(struct dentry *dentry, unsigned int sigen, ++ struct au_sbinfo *sbinfo, ++ const unsigned int dir_flags, unsigned int do_idop) ++{ ++ int err; ++ struct dentry *parent; ++ ++ err = 0; ++ parent = dget_parent(dentry); ++ if (!au_digen_test(parent, sigen) && au_digen_test(dentry, sigen)) { ++ if (d_really_is_positive(dentry)) { ++ if (!d_is_dir(dentry)) ++ err = au_do_refresh(dentry, /*dir_flags*/0, ++ parent); ++ else { ++ err = au_do_refresh(dentry, dir_flags, parent); ++ if (unlikely(err)) ++ au_fset_si(sbinfo, FAILED_REFRESH_DIR); ++ } ++ } else ++ err = au_do_refresh(dentry, /*dir_flags*/0, parent); ++ AuDbgDentry(dentry); ++ } ++ dput(parent); ++ ++ if (!err) { ++ if (do_idop) ++ au_refresh_dop(dentry, /*force_reval*/0); ++ } else ++ au_refresh_dop(dentry, /*force_reval*/1); ++ ++ AuTraceErr(err); ++ return err; ++} ++ ++static int au_refresh_d(struct super_block *sb, unsigned int do_idop) ++{ ++ int err, i, j, ndentry, e; ++ unsigned int sigen; ++ struct au_dcsub_pages dpages; ++ struct au_dpage *dpage; ++ struct dentry **dentries, *d; ++ struct au_sbinfo *sbinfo; ++ struct dentry *root = sb->s_root; ++ const unsigned int dir_flags = au_hi_flags(d_inode(root), /*isdir*/1); ++ ++ if (do_idop) ++ au_refresh_dop(root, /*force_reval*/0); ++ ++ err = au_dpages_init(&dpages, GFP_NOFS); ++ if (unlikely(err)) ++ goto out; ++ err = au_dcsub_pages(&dpages, root, NULL, NULL); ++ if (unlikely(err)) ++ goto out_dpages; ++ ++ sigen = au_sigen(sb); ++ sbinfo = au_sbi(sb); ++ for (i = 0; i < dpages.ndpage; i++) { ++ dpage = dpages.dpages + i; ++ dentries = dpage->dentries; ++ ndentry = dpage->ndentry; ++ for (j = 0; j < ndentry; j++) { ++ d = dentries[j]; ++ e = au_do_refresh_d(d, sigen, sbinfo, dir_flags, ++ do_idop); ++ if (unlikely(e && !err)) ++ err = e; ++ /* go on even err */ ++ } ++ } ++ ++out_dpages: ++ au_dpages_free(&dpages); ++out: ++ return err; ++} ++ ++static int au_refresh_i(struct super_block *sb, unsigned int do_idop) ++{ ++ int err, e; ++ unsigned int sigen; ++ unsigned long long max, ull; ++ struct inode *inode, **array; ++ ++ array = au_iarray_alloc(sb, &max); ++ err = PTR_ERR(array); ++ if (IS_ERR(array)) ++ goto out; ++ ++ err = 0; ++ sigen = au_sigen(sb); ++ for (ull = 0; ull < max; ull++) { ++ inode = array[ull]; ++ if (unlikely(!inode)) ++ break; ++ ++ e = 0; ++ ii_write_lock_child(inode); ++ if (au_iigen(inode, NULL) != sigen) { ++ e = au_refresh_hinode_self(inode); ++ if (unlikely(e)) { ++ au_refresh_iop(inode, /*force_getattr*/1); ++ pr_err("error %d, i%lu\n", e, inode->i_ino); ++ if (!err) ++ err = e; ++ /* go on even if err */ ++ } ++ } ++ if (!e && do_idop) ++ au_refresh_iop(inode, /*force_getattr*/0); ++ ii_write_unlock(inode); ++ } ++ ++ au_iarray_free(array, max); ++ ++out: ++ return err; ++} ++ ++static void au_remount_refresh(struct super_block *sb, unsigned int do_idop) ++{ ++ int err, e; ++ unsigned int udba; ++ aufs_bindex_t bindex, bbot; ++ struct dentry *root; ++ struct inode *inode; ++ struct au_branch *br; ++ struct au_sbinfo *sbi; ++ ++ au_sigen_inc(sb); ++ sbi = au_sbi(sb); ++ au_fclr_si(sbi, FAILED_REFRESH_DIR); ++ ++ root = sb->s_root; ++ DiMustNoWaiters(root); ++ inode = d_inode(root); ++ IiMustNoWaiters(inode); ++ ++ udba = au_opt_udba(sb); ++ bbot = au_sbbot(sb); ++ for (bindex = 0; bindex <= bbot; bindex++) { ++ br = au_sbr(sb, bindex); ++ err = au_hnotify_reset_br(udba, br, br->br_perm); ++ if (unlikely(err)) ++ AuIOErr("hnotify failed on br %d, %d, ignored\n", ++ bindex, err); ++ /* go on even if err */ ++ } ++ au_hn_reset(inode, au_hi_flags(inode, /*isdir*/1)); ++ ++ if (do_idop) { ++ if (au_ftest_si(sbi, NO_DREVAL)) { ++ AuDebugOn(sb->s_d_op == &aufs_dop_noreval); ++ sb->s_d_op = &aufs_dop_noreval; ++ AuDebugOn(sbi->si_iop_array == aufs_iop_nogetattr); ++ sbi->si_iop_array = aufs_iop_nogetattr; ++ } else { ++ AuDebugOn(sb->s_d_op == &aufs_dop); ++ sb->s_d_op = &aufs_dop; ++ AuDebugOn(sbi->si_iop_array == aufs_iop); ++ sbi->si_iop_array = aufs_iop; ++ } ++ pr_info("reset to %pf and %pf\n", ++ sb->s_d_op, sbi->si_iop_array); ++ } ++ ++ di_write_unlock(root); ++ err = au_refresh_d(sb, do_idop); ++ e = au_refresh_i(sb, do_idop); ++ if (unlikely(e && !err)) ++ err = e; ++ /* aufs_write_lock() calls ..._child() */ ++ di_write_lock_child(root); ++ ++ au_cpup_attr_all(inode, /*force*/1); ++ ++ if (unlikely(err)) ++ AuIOErr("refresh failed, ignored, %d\n", err); ++} ++ ++/* stop extra interpretation of errno in mount(8), and strange error messages */ ++static int cvt_err(int err) ++{ ++ AuTraceErr(err); ++ ++ switch (err) { ++ case -ENOENT: ++ case -ENOTDIR: ++ case -EEXIST: ++ case -EIO: ++ err = -EINVAL; ++ } ++ return err; ++} ++ ++static int aufs_remount_fs(struct super_block *sb, int *flags, char *data) ++{ ++ int err, do_dx; ++ unsigned int mntflags; ++ struct au_opts opts = { ++ .opt = NULL ++ }; ++ struct dentry *root; ++ struct inode *inode; ++ struct au_sbinfo *sbinfo; ++ ++ err = 0; ++ root = sb->s_root; ++ if (!data || !*data) { ++ err = si_write_lock(sb, AuLock_FLUSH | AuLock_NOPLM); ++ if (!err) { ++ di_write_lock_child(root); ++ err = au_opts_verify(sb, *flags, /*pending*/0); ++ aufs_write_unlock(root); ++ } ++ goto out; ++ } ++ ++ err = -ENOMEM; ++ opts.opt = (void *)__get_free_page(GFP_NOFS); ++ if (unlikely(!opts.opt)) ++ goto out; ++ opts.max_opt = PAGE_SIZE / sizeof(*opts.opt); ++ opts.flags = AuOpts_REMOUNT; ++ opts.sb_flags = *flags; ++ ++ /* parse it before aufs lock */ ++ err = au_opts_parse(sb, data, &opts); ++ if (unlikely(err)) ++ goto out_opts; ++ ++ sbinfo = au_sbi(sb); ++ inode = d_inode(root); ++ inode_lock(inode); ++ err = si_write_lock(sb, AuLock_FLUSH | AuLock_NOPLM); ++ if (unlikely(err)) ++ goto out_mtx; ++ di_write_lock_child(root); ++ ++ /* au_opts_remount() may return an error */ ++ err = au_opts_remount(sb, &opts); ++ au_opts_free(&opts); ++ ++ if (au_ftest_opts(opts.flags, REFRESH)) ++ au_remount_refresh(sb, au_ftest_opts(opts.flags, REFRESH_IDOP)); ++ ++ if (au_ftest_opts(opts.flags, REFRESH_DYAOP)) { ++ mntflags = au_mntflags(sb); ++ do_dx = !!au_opt_test(mntflags, DIO); ++ au_dy_arefresh(do_dx); ++ } ++ ++ au_fhsm_wrote_all(sb, /*force*/1); /* ?? */ ++ aufs_write_unlock(root); ++ ++out_mtx: ++ inode_unlock(inode); ++out_opts: ++ free_page((unsigned long)opts.opt); ++out: ++ err = cvt_err(err); ++ AuTraceErr(err); ++ return err; ++} ++ ++static const struct super_operations aufs_sop = { ++ .alloc_inode = aufs_alloc_inode, ++ .destroy_inode = aufs_destroy_inode, ++ /* always deleting, no clearing */ ++ .drop_inode = generic_delete_inode, ++ .show_options = aufs_show_options, ++ .statfs = aufs_statfs, ++ .put_super = aufs_put_super, ++ .sync_fs = aufs_sync_fs, ++ .remount_fs = aufs_remount_fs ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int alloc_root(struct super_block *sb) ++{ ++ int err; ++ struct inode *inode; ++ struct dentry *root; ++ ++ err = -ENOMEM; ++ inode = au_iget_locked(sb, AUFS_ROOT_INO); ++ err = PTR_ERR(inode); ++ if (IS_ERR(inode)) ++ goto out; ++ ++ inode->i_op = aufs_iop + AuIop_DIR; /* with getattr by default */ ++ inode->i_fop = &aufs_dir_fop; ++ inode->i_mode = S_IFDIR; ++ set_nlink(inode, 2); ++ unlock_new_inode(inode); ++ ++ root = d_make_root(inode); ++ if (unlikely(!root)) ++ goto out; ++ err = PTR_ERR(root); ++ if (IS_ERR(root)) ++ goto out; ++ ++ err = au_di_init(root); ++ if (!err) { ++ sb->s_root = root; ++ return 0; /* success */ ++ } ++ dput(root); ++ ++out: ++ return err; ++} ++ ++static int aufs_fill_super(struct super_block *sb, void *raw_data, ++ int silent __maybe_unused) ++{ ++ int err; ++ struct au_opts opts = { ++ .opt = NULL ++ }; ++ struct au_sbinfo *sbinfo; ++ struct dentry *root; ++ struct inode *inode; ++ char *arg = raw_data; ++ ++ if (unlikely(!arg || !*arg)) { ++ err = -EINVAL; ++ pr_err("no arg\n"); ++ goto out; ++ } ++ ++ err = -ENOMEM; ++ opts.opt = (void *)__get_free_page(GFP_NOFS); ++ if (unlikely(!opts.opt)) ++ goto out; ++ opts.max_opt = PAGE_SIZE / sizeof(*opts.opt); ++ opts.sb_flags = sb->s_flags; ++ ++ err = au_si_alloc(sb); ++ if (unlikely(err)) ++ goto out_opts; ++ sbinfo = au_sbi(sb); ++ ++ /* all timestamps always follow the ones on the branch */ ++ sb->s_flags |= MS_NOATIME | MS_NODIRATIME; ++ sb->s_op = &aufs_sop; ++ sb->s_d_op = &aufs_dop; ++ sb->s_magic = AUFS_SUPER_MAGIC; ++ sb->s_maxbytes = 0; ++ sb->s_stack_depth = 1; ++ au_export_init(sb); ++ /* au_xattr_init(sb); */ ++ ++ err = alloc_root(sb); ++ if (unlikely(err)) { ++ si_write_unlock(sb); ++ goto out_info; ++ } ++ root = sb->s_root; ++ inode = d_inode(root); ++ ++ /* ++ * actually we can parse options regardless aufs lock here. ++ * but at remount time, parsing must be done before aufs lock. ++ * so we follow the same rule. ++ */ ++ ii_write_lock_parent(inode); ++ aufs_write_unlock(root); ++ err = au_opts_parse(sb, arg, &opts); ++ if (unlikely(err)) ++ goto out_root; ++ ++ /* lock vfs_inode first, then aufs. */ ++ inode_lock(inode); ++ aufs_write_lock(root); ++ err = au_opts_mount(sb, &opts); ++ au_opts_free(&opts); ++ if (!err && au_ftest_si(sbinfo, NO_DREVAL)) { ++ sb->s_d_op = &aufs_dop_noreval; ++ pr_info("%pf\n", sb->s_d_op); ++ au_refresh_dop(root, /*force_reval*/0); ++ sbinfo->si_iop_array = aufs_iop_nogetattr; ++ au_refresh_iop(inode, /*force_getattr*/0); ++ } ++ aufs_write_unlock(root); ++ inode_unlock(inode); ++ if (!err) ++ goto out_opts; /* success */ ++ ++out_root: ++ dput(root); ++ sb->s_root = NULL; ++out_info: ++ dbgaufs_si_fin(sbinfo); ++ kobject_put(&sbinfo->si_kobj); ++ sb->s_fs_info = NULL; ++out_opts: ++ free_page((unsigned long)opts.opt); ++out: ++ AuTraceErr(err); ++ err = cvt_err(err); ++ AuTraceErr(err); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static struct dentry *aufs_mount(struct file_system_type *fs_type, int flags, ++ const char *dev_name __maybe_unused, ++ void *raw_data) ++{ ++ struct dentry *root; ++ struct super_block *sb; ++ ++ /* all timestamps always follow the ones on the branch */ ++ /* mnt->mnt_flags |= MNT_NOATIME | MNT_NODIRATIME; */ ++ root = mount_nodev(fs_type, flags, raw_data, aufs_fill_super); ++ if (IS_ERR(root)) ++ goto out; ++ ++ sb = root->d_sb; ++ si_write_lock(sb, !AuLock_FLUSH); ++ sysaufs_brs_add(sb, 0); ++ si_write_unlock(sb); ++ au_sbilist_add(sb); ++ ++out: ++ return root; ++} ++ ++static void aufs_kill_sb(struct super_block *sb) ++{ ++ struct au_sbinfo *sbinfo; ++ ++ sbinfo = au_sbi(sb); ++ if (sbinfo) { ++ au_sbilist_del(sb); ++ aufs_write_lock(sb->s_root); ++ au_fhsm_fin(sb); ++ if (sbinfo->si_wbr_create_ops->fin) ++ sbinfo->si_wbr_create_ops->fin(sb); ++ if (au_opt_test(sbinfo->si_mntflags, UDBA_HNOTIFY)) { ++ au_opt_set_udba(sbinfo->si_mntflags, UDBA_NONE); ++ au_remount_refresh(sb, /*do_idop*/0); ++ } ++ if (au_opt_test(sbinfo->si_mntflags, PLINK)) ++ au_plink_put(sb, /*verbose*/1); ++ au_xino_clr(sb); ++ sbinfo->si_sb = NULL; ++ aufs_write_unlock(sb->s_root); ++ au_nwt_flush(&sbinfo->si_nowait); ++ } ++ kill_anon_super(sb); ++} ++ ++struct file_system_type aufs_fs_type = { ++ .name = AUFS_FSTYPE, ++ /* a race between rename and others */ ++ .fs_flags = FS_RENAME_DOES_D_MOVE, ++ .mount = aufs_mount, ++ .kill_sb = aufs_kill_sb, ++ /* no need to __module_get() and module_put(). */ ++ .owner = THIS_MODULE, ++}; +diff --git a/fs/aufs/super.h b/fs/aufs/super.h +new file mode 100644 +index 0000000..4e6e5ab +--- /dev/null ++++ b/fs/aufs/super.h +@@ -0,0 +1,638 @@ ++/* ++ * Copyright (C) 2005-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * super_block operations ++ */ ++ ++#ifndef __AUFS_SUPER_H__ ++#define __AUFS_SUPER_H__ ++ ++#ifdef __KERNEL__ ++ ++#include ++#include ++#include "rwsem.h" ++#include "spl.h" ++#include "wkq.h" ++ ++/* policies to select one among multiple writable branches */ ++struct au_wbr_copyup_operations { ++ int (*copyup)(struct dentry *dentry); ++}; ++ ++#define AuWbr_DIR 1 /* target is a dir */ ++#define AuWbr_PARENT (1 << 1) /* always require a parent */ ++ ++#define au_ftest_wbr(flags, name) ((flags) & AuWbr_##name) ++#define au_fset_wbr(flags, name) { (flags) |= AuWbr_##name; } ++#define au_fclr_wbr(flags, name) { (flags) &= ~AuWbr_##name; } ++ ++struct au_wbr_create_operations { ++ int (*create)(struct dentry *dentry, unsigned int flags); ++ int (*init)(struct super_block *sb); ++ int (*fin)(struct super_block *sb); ++}; ++ ++struct au_wbr_mfs { ++ struct mutex mfs_lock; /* protect this structure */ ++ unsigned long mfs_jiffy; ++ unsigned long mfs_expire; ++ aufs_bindex_t mfs_bindex; ++ ++ unsigned long long mfsrr_bytes; ++ unsigned long long mfsrr_watermark; ++}; ++ ++#define AuPlink_NHASH 100 ++static inline int au_plink_hash(ino_t ino) ++{ ++ return ino % AuPlink_NHASH; ++} ++ ++/* File-based Hierarchical Storage Management */ ++struct au_fhsm { ++#ifdef CONFIG_AUFS_FHSM ++ /* allow only one process who can receive the notification */ ++ spinlock_t fhsm_spin; ++ pid_t fhsm_pid; ++ wait_queue_head_t fhsm_wqh; ++ atomic_t fhsm_readable; ++ ++ /* these are protected by si_rwsem */ ++ unsigned long fhsm_expire; ++ aufs_bindex_t fhsm_bottom; ++#endif ++}; ++ ++#define AU_PIDSTEP (int)(BITS_TO_LONGS(PID_MAX_DEFAULT) * BITS_PER_LONG) ++#define AU_NPIDMAP (int)DIV_ROUND_UP(PID_MAX_LIMIT, AU_PIDSTEP) ++struct au_si_pid { ++ unsigned long *pid_bitmap[AU_NPIDMAP]; ++ struct mutex pid_mtx; ++}; ++ ++struct au_branch; ++struct au_sbinfo { ++ /* nowait tasks in the system-wide workqueue */ ++ struct au_nowait_tasks si_nowait; ++ ++ /* ++ * tried sb->s_umount, but failed due to the dependecy between i_mutex. ++ * rwsem for au_sbinfo is necessary. ++ */ ++ struct au_rwsem si_rwsem; ++ ++ /* prevent recursive locking in deleting inode */ ++ struct au_si_pid au_si_pid; ++ ++ /* ++ * dirty approach to protect sb->sb_inodes and ->s_files (gone) from ++ * remount. ++ */ ++ struct percpu_counter si_ninodes, si_nfiles; ++ ++ /* branch management */ ++ unsigned int si_generation; ++ ++ /* see AuSi_ flags */ ++ unsigned char au_si_status; ++ ++ aufs_bindex_t si_bbot; ++ ++ /* dirty trick to keep br_id plus */ ++ unsigned int si_last_br_id : ++ sizeof(aufs_bindex_t) * BITS_PER_BYTE - 1; ++ struct au_branch **si_branch; ++ ++ /* policy to select a writable branch */ ++ unsigned char si_wbr_copyup; ++ unsigned char si_wbr_create; ++ struct au_wbr_copyup_operations *si_wbr_copyup_ops; ++ struct au_wbr_create_operations *si_wbr_create_ops; ++ ++ /* round robin */ ++ atomic_t si_wbr_rr_next; ++ ++ /* most free space */ ++ struct au_wbr_mfs si_wbr_mfs; ++ ++ /* File-based Hierarchical Storage Management */ ++ struct au_fhsm si_fhsm; ++ ++ /* mount flags */ ++ /* include/asm-ia64/siginfo.h defines a macro named si_flags */ ++ unsigned int si_mntflags; ++ ++ /* external inode number (bitmap and translation table) */ ++ vfs_readf_t si_xread; ++ vfs_writef_t si_xwrite; ++ struct file *si_xib; ++ struct mutex si_xib_mtx; /* protect xib members */ ++ unsigned long *si_xib_buf; ++ unsigned long si_xib_last_pindex; ++ int si_xib_next_bit; ++ aufs_bindex_t si_xino_brid; ++ unsigned long si_xino_jiffy; ++ unsigned long si_xino_expire; ++ /* reserved for future use */ ++ /* unsigned long long si_xib_limit; */ /* Max xib file size */ ++ ++#ifdef CONFIG_AUFS_EXPORT ++ /* i_generation */ ++ struct file *si_xigen; ++ atomic_t si_xigen_next; ++#endif ++ ++ /* dirty trick to suppoer atomic_open */ ++ struct au_sphlhead si_aopen; ++ ++ /* vdir parameters */ ++ unsigned long si_rdcache; /* max cache time in jiffies */ ++ unsigned int si_rdblk; /* deblk size */ ++ unsigned int si_rdhash; /* hash size */ ++ ++ /* ++ * If the number of whiteouts are larger than si_dirwh, leave all of ++ * them after au_whtmp_ren to reduce the cost of rmdir(2). ++ * future fsck.aufs or kernel thread will remove them later. ++ * Otherwise, remove all whiteouts and the dir in rmdir(2). ++ */ ++ unsigned int si_dirwh; ++ ++ /* pseudo_link list */ ++ struct au_sphlhead si_plink[AuPlink_NHASH]; ++ wait_queue_head_t si_plink_wq; ++ spinlock_t si_plink_maint_lock; ++ pid_t si_plink_maint_pid; ++ ++ /* file list */ ++ struct au_sphlhead si_files; ++ ++ /* with/without getattr, brother of sb->s_d_op */ ++ struct inode_operations *si_iop_array; ++ ++ /* ++ * sysfs and lifetime management. ++ * this is not a small structure and it may be a waste of memory in case ++ * of sysfs is disabled, particulary when many aufs-es are mounted. ++ * but using sysfs is majority. ++ */ ++ struct kobject si_kobj; ++#ifdef CONFIG_DEBUG_FS ++ struct dentry *si_dbgaufs; ++ struct dentry *si_dbgaufs_plink; ++ struct dentry *si_dbgaufs_xib; ++#ifdef CONFIG_AUFS_EXPORT ++ struct dentry *si_dbgaufs_xigen; ++#endif ++#endif ++ ++#ifdef CONFIG_AUFS_SBILIST ++ struct hlist_node si_list; ++#endif ++ ++ /* dirty, necessary for unmounting, sysfs and sysrq */ ++ struct super_block *si_sb; ++}; ++ ++/* sbinfo status flags */ ++/* ++ * set true when refresh_dirs() failed at remount time. ++ * then try refreshing dirs at access time again. ++ * if it is false, refreshing dirs at access time is unnecesary ++ */ ++#define AuSi_FAILED_REFRESH_DIR 1 ++#define AuSi_FHSM (1 << 1) /* fhsm is active now */ ++#define AuSi_NO_DREVAL (1 << 2) /* disable all d_revalidate */ ++ ++#ifndef CONFIG_AUFS_FHSM ++#undef AuSi_FHSM ++#define AuSi_FHSM 0 ++#endif ++ ++static inline unsigned char au_do_ftest_si(struct au_sbinfo *sbi, ++ unsigned int flag) ++{ ++ AuRwMustAnyLock(&sbi->si_rwsem); ++ return sbi->au_si_status & flag; ++} ++#define au_ftest_si(sbinfo, name) au_do_ftest_si(sbinfo, AuSi_##name) ++#define au_fset_si(sbinfo, name) do { \ ++ AuRwMustWriteLock(&(sbinfo)->si_rwsem); \ ++ (sbinfo)->au_si_status |= AuSi_##name; \ ++} while (0) ++#define au_fclr_si(sbinfo, name) do { \ ++ AuRwMustWriteLock(&(sbinfo)->si_rwsem); \ ++ (sbinfo)->au_si_status &= ~AuSi_##name; \ ++} while (0) ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* policy to select one among writable branches */ ++#define AuWbrCopyup(sbinfo, ...) \ ++ ((sbinfo)->si_wbr_copyup_ops->copyup(__VA_ARGS__)) ++#define AuWbrCreate(sbinfo, ...) \ ++ ((sbinfo)->si_wbr_create_ops->create(__VA_ARGS__)) ++ ++/* flags for si_read_lock()/aufs_read_lock()/di_read_lock() */ ++#define AuLock_DW 1 /* write-lock dentry */ ++#define AuLock_IR (1 << 1) /* read-lock inode */ ++#define AuLock_IW (1 << 2) /* write-lock inode */ ++#define AuLock_FLUSH (1 << 3) /* wait for 'nowait' tasks */ ++#define AuLock_DIRS (1 << 4) /* target is a pair of dirs */ ++#define AuLock_NOPLM (1 << 5) /* return err in plm mode */ ++#define AuLock_NOPLMW (1 << 6) /* wait for plm mode ends */ ++#define AuLock_GEN (1 << 7) /* test digen/iigen */ ++#define au_ftest_lock(flags, name) ((flags) & AuLock_##name) ++#define au_fset_lock(flags, name) \ ++ do { (flags) |= AuLock_##name; } while (0) ++#define au_fclr_lock(flags, name) \ ++ do { (flags) &= ~AuLock_##name; } while (0) ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* super.c */ ++extern struct file_system_type aufs_fs_type; ++struct inode *au_iget_locked(struct super_block *sb, ino_t ino); ++typedef unsigned long long (*au_arraycb_t)(struct super_block *sb, void *array, ++ unsigned long long max, void *arg); ++void *au_array_alloc(unsigned long long *hint, au_arraycb_t cb, ++ struct super_block *sb, void *arg); ++struct inode **au_iarray_alloc(struct super_block *sb, unsigned long long *max); ++void au_iarray_free(struct inode **a, unsigned long long max); ++ ++/* sbinfo.c */ ++void au_si_free(struct kobject *kobj); ++int au_si_alloc(struct super_block *sb); ++int au_sbr_realloc(struct au_sbinfo *sbinfo, int nbr); ++ ++unsigned int au_sigen_inc(struct super_block *sb); ++aufs_bindex_t au_new_br_id(struct super_block *sb); ++ ++int si_read_lock(struct super_block *sb, int flags); ++int si_write_lock(struct super_block *sb, int flags); ++int aufs_read_lock(struct dentry *dentry, int flags); ++void aufs_read_unlock(struct dentry *dentry, int flags); ++void aufs_write_lock(struct dentry *dentry); ++void aufs_write_unlock(struct dentry *dentry); ++int aufs_read_and_write_lock2(struct dentry *d1, struct dentry *d2, int flags); ++void aufs_read_and_write_unlock2(struct dentry *d1, struct dentry *d2); ++ ++/* wbr_policy.c */ ++extern struct au_wbr_copyup_operations au_wbr_copyup_ops[]; ++extern struct au_wbr_create_operations au_wbr_create_ops[]; ++int au_cpdown_dirs(struct dentry *dentry, aufs_bindex_t bdst); ++int au_wbr_nonopq(struct dentry *dentry, aufs_bindex_t bindex); ++int au_wbr_do_copyup_bu(struct dentry *dentry, aufs_bindex_t btop); ++ ++/* mvdown.c */ ++int au_mvdown(struct dentry *dentry, struct aufs_mvdown __user *arg); ++ ++#ifdef CONFIG_AUFS_FHSM ++/* fhsm.c */ ++ ++static inline pid_t au_fhsm_pid(struct au_fhsm *fhsm) ++{ ++ pid_t pid; ++ ++ spin_lock(&fhsm->fhsm_spin); ++ pid = fhsm->fhsm_pid; ++ spin_unlock(&fhsm->fhsm_spin); ++ ++ return pid; ++} ++ ++void au_fhsm_wrote(struct super_block *sb, aufs_bindex_t bindex, int force); ++void au_fhsm_wrote_all(struct super_block *sb, int force); ++int au_fhsm_fd(struct super_block *sb, int oflags); ++int au_fhsm_br_alloc(struct au_branch *br); ++void au_fhsm_set_bottom(struct super_block *sb, aufs_bindex_t bindex); ++void au_fhsm_fin(struct super_block *sb); ++void au_fhsm_init(struct au_sbinfo *sbinfo); ++void au_fhsm_set(struct au_sbinfo *sbinfo, unsigned int sec); ++void au_fhsm_show(struct seq_file *seq, struct au_sbinfo *sbinfo); ++#else ++AuStubVoid(au_fhsm_wrote, struct super_block *sb, aufs_bindex_t bindex, ++ int force) ++AuStubVoid(au_fhsm_wrote_all, struct super_block *sb, int force) ++AuStub(int, au_fhsm_fd, return -EOPNOTSUPP, struct super_block *sb, int oflags) ++AuStub(pid_t, au_fhsm_pid, return 0, struct au_fhsm *fhsm) ++AuStubInt0(au_fhsm_br_alloc, struct au_branch *br) ++AuStubVoid(au_fhsm_set_bottom, struct super_block *sb, aufs_bindex_t bindex) ++AuStubVoid(au_fhsm_fin, struct super_block *sb) ++AuStubVoid(au_fhsm_init, struct au_sbinfo *sbinfo) ++AuStubVoid(au_fhsm_set, struct au_sbinfo *sbinfo, unsigned int sec) ++AuStubVoid(au_fhsm_show, struct seq_file *seq, struct au_sbinfo *sbinfo) ++#endif ++ ++/* ---------------------------------------------------------------------- */ ++ ++static inline struct au_sbinfo *au_sbi(struct super_block *sb) ++{ ++ return sb->s_fs_info; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++#ifdef CONFIG_AUFS_EXPORT ++int au_test_nfsd(void); ++void au_export_init(struct super_block *sb); ++void au_xigen_inc(struct inode *inode); ++int au_xigen_new(struct inode *inode); ++int au_xigen_set(struct super_block *sb, struct file *base); ++void au_xigen_clr(struct super_block *sb); ++ ++static inline int au_busy_or_stale(void) ++{ ++ if (!au_test_nfsd()) ++ return -EBUSY; ++ return -ESTALE; ++} ++#else ++AuStubInt0(au_test_nfsd, void) ++AuStubVoid(au_export_init, struct super_block *sb) ++AuStubVoid(au_xigen_inc, struct inode *inode) ++AuStubInt0(au_xigen_new, struct inode *inode) ++AuStubInt0(au_xigen_set, struct super_block *sb, struct file *base) ++AuStubVoid(au_xigen_clr, struct super_block *sb) ++AuStub(int, au_busy_or_stale, return -EBUSY, void) ++#endif /* CONFIG_AUFS_EXPORT */ ++ ++/* ---------------------------------------------------------------------- */ ++ ++#ifdef CONFIG_AUFS_SBILIST ++/* module.c */ ++extern struct au_sphlhead au_sbilist; ++ ++static inline void au_sbilist_init(void) ++{ ++ au_sphl_init(&au_sbilist); ++} ++ ++static inline void au_sbilist_add(struct super_block *sb) ++{ ++ au_sphl_add(&au_sbi(sb)->si_list, &au_sbilist); ++} ++ ++static inline void au_sbilist_del(struct super_block *sb) ++{ ++ au_sphl_del(&au_sbi(sb)->si_list, &au_sbilist); ++} ++ ++#ifdef CONFIG_AUFS_MAGIC_SYSRQ ++static inline void au_sbilist_lock(void) ++{ ++ spin_lock(&au_sbilist.spin); ++} ++ ++static inline void au_sbilist_unlock(void) ++{ ++ spin_unlock(&au_sbilist.spin); ++} ++#define AuGFP_SBILIST GFP_ATOMIC ++#else ++AuStubVoid(au_sbilist_lock, void) ++AuStubVoid(au_sbilist_unlock, void) ++#define AuGFP_SBILIST GFP_NOFS ++#endif /* CONFIG_AUFS_MAGIC_SYSRQ */ ++#else ++AuStubVoid(au_sbilist_init, void) ++AuStubVoid(au_sbilist_add, struct super_block *sb) ++AuStubVoid(au_sbilist_del, struct super_block *sb) ++AuStubVoid(au_sbilist_lock, void) ++AuStubVoid(au_sbilist_unlock, void) ++#define AuGFP_SBILIST GFP_NOFS ++#endif ++ ++/* ---------------------------------------------------------------------- */ ++ ++static inline void dbgaufs_si_null(struct au_sbinfo *sbinfo) ++{ ++ /* ++ * This function is a dynamic '__init' function actually, ++ * so the tiny check for si_rwsem is unnecessary. ++ */ ++ /* AuRwMustWriteLock(&sbinfo->si_rwsem); */ ++#ifdef CONFIG_DEBUG_FS ++ sbinfo->si_dbgaufs = NULL; ++ sbinfo->si_dbgaufs_plink = NULL; ++ sbinfo->si_dbgaufs_xib = NULL; ++#ifdef CONFIG_AUFS_EXPORT ++ sbinfo->si_dbgaufs_xigen = NULL; ++#endif ++#endif ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static inline void si_pid_idx_bit(int *idx, pid_t *bit) ++{ ++ /* the origin of pid is 1, but the bitmap's is 0 */ ++ *bit = current->pid - 1; ++ *idx = *bit / AU_PIDSTEP; ++ *bit %= AU_PIDSTEP; ++} ++ ++static inline int si_pid_test(struct super_block *sb) ++{ ++ pid_t bit; ++ int idx; ++ unsigned long *bitmap; ++ ++ si_pid_idx_bit(&idx, &bit); ++ bitmap = au_sbi(sb)->au_si_pid.pid_bitmap[idx]; ++ if (bitmap) ++ return test_bit(bit, bitmap); ++ return 0; ++} ++ ++static inline void si_pid_clr(struct super_block *sb) ++{ ++ pid_t bit; ++ int idx; ++ unsigned long *bitmap; ++ ++ si_pid_idx_bit(&idx, &bit); ++ bitmap = au_sbi(sb)->au_si_pid.pid_bitmap[idx]; ++ BUG_ON(!bitmap); ++ AuDebugOn(!test_bit(bit, bitmap)); ++ clear_bit(bit, bitmap); ++ /* smp_mb(); */ ++} ++ ++void si_pid_set(struct super_block *sb); ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* lock superblock. mainly for entry point functions */ ++/* ++ * __si_read_lock, __si_write_lock, ++ * __si_read_unlock, __si_write_unlock, __si_downgrade_lock ++ */ ++AuSimpleRwsemFuncs(__si, struct super_block *sb, &au_sbi(sb)->si_rwsem); ++ ++#define SiMustNoWaiters(sb) AuRwMustNoWaiters(&au_sbi(sb)->si_rwsem) ++#define SiMustAnyLock(sb) AuRwMustAnyLock(&au_sbi(sb)->si_rwsem) ++#define SiMustWriteLock(sb) AuRwMustWriteLock(&au_sbi(sb)->si_rwsem) ++ ++static inline void si_noflush_read_lock(struct super_block *sb) ++{ ++ __si_read_lock(sb); ++ si_pid_set(sb); ++} ++ ++static inline int si_noflush_read_trylock(struct super_block *sb) ++{ ++ int locked; ++ ++ locked = __si_read_trylock(sb); ++ if (locked) ++ si_pid_set(sb); ++ return locked; ++} ++ ++static inline void si_noflush_write_lock(struct super_block *sb) ++{ ++ __si_write_lock(sb); ++ si_pid_set(sb); ++} ++ ++static inline int si_noflush_write_trylock(struct super_block *sb) ++{ ++ int locked; ++ ++ locked = __si_write_trylock(sb); ++ if (locked) ++ si_pid_set(sb); ++ return locked; ++} ++ ++#if 0 /* reserved */ ++static inline int si_read_trylock(struct super_block *sb, int flags) ++{ ++ if (au_ftest_lock(flags, FLUSH)) ++ au_nwt_flush(&au_sbi(sb)->si_nowait); ++ return si_noflush_read_trylock(sb); ++} ++#endif ++ ++static inline void si_read_unlock(struct super_block *sb) ++{ ++ si_pid_clr(sb); ++ __si_read_unlock(sb); ++} ++ ++#if 0 /* reserved */ ++static inline int si_write_trylock(struct super_block *sb, int flags) ++{ ++ if (au_ftest_lock(flags, FLUSH)) ++ au_nwt_flush(&au_sbi(sb)->si_nowait); ++ return si_noflush_write_trylock(sb); ++} ++#endif ++ ++static inline void si_write_unlock(struct super_block *sb) ++{ ++ si_pid_clr(sb); ++ __si_write_unlock(sb); ++} ++ ++#if 0 /* reserved */ ++static inline void si_downgrade_lock(struct super_block *sb) ++{ ++ __si_downgrade_lock(sb); ++} ++#endif ++ ++/* ---------------------------------------------------------------------- */ ++ ++static inline aufs_bindex_t au_sbbot(struct super_block *sb) ++{ ++ SiMustAnyLock(sb); ++ return au_sbi(sb)->si_bbot; ++} ++ ++static inline unsigned int au_mntflags(struct super_block *sb) ++{ ++ SiMustAnyLock(sb); ++ return au_sbi(sb)->si_mntflags; ++} ++ ++static inline unsigned int au_sigen(struct super_block *sb) ++{ ++ SiMustAnyLock(sb); ++ return au_sbi(sb)->si_generation; ++} ++ ++static inline unsigned long long au_ninodes(struct super_block *sb) ++{ ++ s64 n = percpu_counter_sum(&au_sbi(sb)->si_ninodes); ++ ++ BUG_ON(n < 0); ++ return n; ++} ++ ++static inline void au_ninodes_inc(struct super_block *sb) ++{ ++ percpu_counter_inc(&au_sbi(sb)->si_ninodes); ++} ++ ++static inline void au_ninodes_dec(struct super_block *sb) ++{ ++ percpu_counter_dec(&au_sbi(sb)->si_ninodes); ++} ++ ++static inline unsigned long long au_nfiles(struct super_block *sb) ++{ ++ s64 n = percpu_counter_sum(&au_sbi(sb)->si_nfiles); ++ ++ BUG_ON(n < 0); ++ return n; ++} ++ ++static inline void au_nfiles_inc(struct super_block *sb) ++{ ++ percpu_counter_inc(&au_sbi(sb)->si_nfiles); ++} ++ ++static inline void au_nfiles_dec(struct super_block *sb) ++{ ++ percpu_counter_dec(&au_sbi(sb)->si_nfiles); ++} ++ ++static inline struct au_branch *au_sbr(struct super_block *sb, ++ aufs_bindex_t bindex) ++{ ++ SiMustAnyLock(sb); ++ return au_sbi(sb)->si_branch[0 + bindex]; ++} ++ ++static inline void au_xino_brid_set(struct super_block *sb, aufs_bindex_t brid) ++{ ++ SiMustWriteLock(sb); ++ au_sbi(sb)->si_xino_brid = brid; ++} ++ ++static inline aufs_bindex_t au_xino_brid(struct super_block *sb) ++{ ++ SiMustAnyLock(sb); ++ return au_sbi(sb)->si_xino_brid; ++} ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_SUPER_H__ */ +diff --git a/fs/aufs/sysaufs.c b/fs/aufs/sysaufs.c +new file mode 100644 +index 0000000..75c9c24 +--- /dev/null ++++ b/fs/aufs/sysaufs.c +@@ -0,0 +1,104 @@ ++/* ++ * Copyright (C) 2005-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * sysfs interface and lifetime management ++ * they are necessary regardless sysfs is disabled. ++ */ ++ ++#include ++#include "aufs.h" ++ ++unsigned long sysaufs_si_mask; ++struct kset *sysaufs_kset; ++ ++#define AuSiAttr(_name) { \ ++ .attr = { .name = __stringify(_name), .mode = 0444 }, \ ++ .show = sysaufs_si_##_name, \ ++} ++ ++static struct sysaufs_si_attr sysaufs_si_attr_xi_path = AuSiAttr(xi_path); ++struct attribute *sysaufs_si_attrs[] = { ++ &sysaufs_si_attr_xi_path.attr, ++ NULL, ++}; ++ ++static const struct sysfs_ops au_sbi_ops = { ++ .show = sysaufs_si_show ++}; ++ ++static struct kobj_type au_sbi_ktype = { ++ .release = au_si_free, ++ .sysfs_ops = &au_sbi_ops, ++ .default_attrs = sysaufs_si_attrs ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++int sysaufs_si_init(struct au_sbinfo *sbinfo) ++{ ++ int err; ++ ++ sbinfo->si_kobj.kset = sysaufs_kset; ++ /* cf. sysaufs_name() */ ++ err = kobject_init_and_add ++ (&sbinfo->si_kobj, &au_sbi_ktype, /*&sysaufs_kset->kobj*/NULL, ++ SysaufsSiNamePrefix "%lx", sysaufs_si_id(sbinfo)); ++ ++ dbgaufs_si_null(sbinfo); ++ if (!err) { ++ err = dbgaufs_si_init(sbinfo); ++ if (unlikely(err)) ++ kobject_put(&sbinfo->si_kobj); ++ } ++ return err; ++} ++ ++void sysaufs_fin(void) ++{ ++ dbgaufs_fin(); ++ sysfs_remove_group(&sysaufs_kset->kobj, sysaufs_attr_group); ++ kset_unregister(sysaufs_kset); ++} ++ ++int __init sysaufs_init(void) ++{ ++ int err; ++ ++ do { ++ get_random_bytes(&sysaufs_si_mask, sizeof(sysaufs_si_mask)); ++ } while (!sysaufs_si_mask); ++ ++ err = -EINVAL; ++ sysaufs_kset = kset_create_and_add(AUFS_NAME, NULL, fs_kobj); ++ if (unlikely(!sysaufs_kset)) ++ goto out; ++ err = PTR_ERR(sysaufs_kset); ++ if (IS_ERR(sysaufs_kset)) ++ goto out; ++ err = sysfs_create_group(&sysaufs_kset->kobj, sysaufs_attr_group); ++ if (unlikely(err)) { ++ kset_unregister(sysaufs_kset); ++ goto out; ++ } ++ ++ err = dbgaufs_init(); ++ if (unlikely(err)) ++ sysaufs_fin(); ++out: ++ return err; ++} +diff --git a/fs/aufs/sysaufs.h b/fs/aufs/sysaufs.h +new file mode 100644 +index 0000000..14975c9 +--- /dev/null ++++ b/fs/aufs/sysaufs.h +@@ -0,0 +1,101 @@ ++/* ++ * Copyright (C) 2005-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * sysfs interface and mount lifetime management ++ */ ++ ++#ifndef __SYSAUFS_H__ ++#define __SYSAUFS_H__ ++ ++#ifdef __KERNEL__ ++ ++#include ++#include "module.h" ++ ++struct super_block; ++struct au_sbinfo; ++ ++struct sysaufs_si_attr { ++ struct attribute attr; ++ int (*show)(struct seq_file *seq, struct super_block *sb); ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* sysaufs.c */ ++extern unsigned long sysaufs_si_mask; ++extern struct kset *sysaufs_kset; ++extern struct attribute *sysaufs_si_attrs[]; ++int sysaufs_si_init(struct au_sbinfo *sbinfo); ++int __init sysaufs_init(void); ++void sysaufs_fin(void); ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* some people doesn't like to show a pointer in kernel */ ++static inline unsigned long sysaufs_si_id(struct au_sbinfo *sbinfo) ++{ ++ return sysaufs_si_mask ^ (unsigned long)sbinfo; ++} ++ ++#define SysaufsSiNamePrefix "si_" ++#define SysaufsSiNameLen (sizeof(SysaufsSiNamePrefix) + 16) ++static inline void sysaufs_name(struct au_sbinfo *sbinfo, char *name) ++{ ++ snprintf(name, SysaufsSiNameLen, SysaufsSiNamePrefix "%lx", ++ sysaufs_si_id(sbinfo)); ++} ++ ++struct au_branch; ++#ifdef CONFIG_SYSFS ++/* sysfs.c */ ++extern struct attribute_group *sysaufs_attr_group; ++ ++int sysaufs_si_xi_path(struct seq_file *seq, struct super_block *sb); ++ssize_t sysaufs_si_show(struct kobject *kobj, struct attribute *attr, ++ char *buf); ++long au_brinfo_ioctl(struct file *file, unsigned long arg); ++#ifdef CONFIG_COMPAT ++long au_brinfo_compat_ioctl(struct file *file, unsigned long arg); ++#endif ++ ++void sysaufs_br_init(struct au_branch *br); ++void sysaufs_brs_add(struct super_block *sb, aufs_bindex_t bindex); ++void sysaufs_brs_del(struct super_block *sb, aufs_bindex_t bindex); ++ ++#define sysaufs_brs_init() do {} while (0) ++ ++#else ++#define sysaufs_attr_group NULL ++ ++AuStubInt0(sysaufs_si_xi_path, struct seq_file *seq, struct super_block *sb) ++AuStub(ssize_t, sysaufs_si_show, return 0, struct kobject *kobj, ++ struct attribute *attr, char *buf) ++AuStubVoid(sysaufs_br_init, struct au_branch *br) ++AuStubVoid(sysaufs_brs_add, struct super_block *sb, aufs_bindex_t bindex) ++AuStubVoid(sysaufs_brs_del, struct super_block *sb, aufs_bindex_t bindex) ++ ++static inline void sysaufs_brs_init(void) ++{ ++ sysaufs_brs = 0; ++} ++ ++#endif /* CONFIG_SYSFS */ ++ ++#endif /* __KERNEL__ */ ++#endif /* __SYSAUFS_H__ */ +diff --git a/fs/aufs/sysfs.c b/fs/aufs/sysfs.c +new file mode 100644 +index 0000000..ff1c510 +--- /dev/null ++++ b/fs/aufs/sysfs.c +@@ -0,0 +1,376 @@ ++/* ++ * Copyright (C) 2005-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * sysfs interface ++ */ ++ ++#include ++#include ++#include "aufs.h" ++ ++#ifdef CONFIG_AUFS_FS_MODULE ++/* this entry violates the "one line per file" policy of sysfs */ ++static ssize_t config_show(struct kobject *kobj, struct kobj_attribute *attr, ++ char *buf) ++{ ++ ssize_t err; ++ static char *conf = ++/* this file is generated at compiling */ ++#include "conf.str" ++ ; ++ ++ err = snprintf(buf, PAGE_SIZE, conf); ++ if (unlikely(err >= PAGE_SIZE)) ++ err = -EFBIG; ++ return err; ++} ++ ++static struct kobj_attribute au_config_attr = __ATTR_RO(config); ++#endif ++ ++static struct attribute *au_attr[] = { ++#ifdef CONFIG_AUFS_FS_MODULE ++ &au_config_attr.attr, ++#endif ++ NULL, /* need to NULL terminate the list of attributes */ ++}; ++ ++static struct attribute_group sysaufs_attr_group_body = { ++ .attrs = au_attr ++}; ++ ++struct attribute_group *sysaufs_attr_group = &sysaufs_attr_group_body; ++ ++/* ---------------------------------------------------------------------- */ ++ ++int sysaufs_si_xi_path(struct seq_file *seq, struct super_block *sb) ++{ ++ int err; ++ ++ SiMustAnyLock(sb); ++ ++ err = 0; ++ if (au_opt_test(au_mntflags(sb), XINO)) { ++ err = au_xino_path(seq, au_sbi(sb)->si_xib); ++ seq_putc(seq, '\n'); ++ } ++ return err; ++} ++ ++/* ++ * the lifetime of branch is independent from the entry under sysfs. ++ * sysfs handles the lifetime of the entry, and never call ->show() after it is ++ * unlinked. ++ */ ++static int sysaufs_si_br(struct seq_file *seq, struct super_block *sb, ++ aufs_bindex_t bindex, int idx) ++{ ++ int err; ++ struct path path; ++ struct dentry *root; ++ struct au_branch *br; ++ au_br_perm_str_t perm; ++ ++ AuDbg("b%d\n", bindex); ++ ++ err = 0; ++ root = sb->s_root; ++ di_read_lock_parent(root, !AuLock_IR); ++ br = au_sbr(sb, bindex); ++ ++ switch (idx) { ++ case AuBrSysfs_BR: ++ path.mnt = au_br_mnt(br); ++ path.dentry = au_h_dptr(root, bindex); ++ err = au_seq_path(seq, &path); ++ if (!err) { ++ au_optstr_br_perm(&perm, br->br_perm); ++ seq_printf(seq, "=%s\n", perm.a); ++ } ++ break; ++ case AuBrSysfs_BRID: ++ seq_printf(seq, "%d\n", br->br_id); ++ break; ++ } ++ di_read_unlock(root, !AuLock_IR); ++ if (unlikely(err || seq_has_overflowed(seq))) ++ err = -E2BIG; ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static struct seq_file *au_seq(char *p, ssize_t len) ++{ ++ struct seq_file *seq; ++ ++ seq = kzalloc(sizeof(*seq), GFP_NOFS); ++ if (seq) { ++ /* mutex_init(&seq.lock); */ ++ seq->buf = p; ++ seq->size = len; ++ return seq; /* success */ ++ } ++ ++ seq = ERR_PTR(-ENOMEM); ++ return seq; ++} ++ ++#define SysaufsBr_PREFIX "br" ++#define SysaufsBrid_PREFIX "brid" ++ ++/* todo: file size may exceed PAGE_SIZE */ ++ssize_t sysaufs_si_show(struct kobject *kobj, struct attribute *attr, ++ char *buf) ++{ ++ ssize_t err; ++ int idx; ++ long l; ++ aufs_bindex_t bbot; ++ struct au_sbinfo *sbinfo; ++ struct super_block *sb; ++ struct seq_file *seq; ++ char *name; ++ struct attribute **cattr; ++ ++ sbinfo = container_of(kobj, struct au_sbinfo, si_kobj); ++ sb = sbinfo->si_sb; ++ ++ /* ++ * prevent a race condition between sysfs and aufs. ++ * for instance, sysfs_file_read() calls sysfs_get_active_two() which ++ * prohibits maintaining the sysfs entries. ++ * hew we acquire read lock after sysfs_get_active_two(). ++ * on the other hand, the remount process may maintain the sysfs/aufs ++ * entries after acquiring write lock. ++ * it can cause a deadlock. ++ * simply we gave up processing read here. ++ */ ++ err = -EBUSY; ++ if (unlikely(!si_noflush_read_trylock(sb))) ++ goto out; ++ ++ seq = au_seq(buf, PAGE_SIZE); ++ err = PTR_ERR(seq); ++ if (IS_ERR(seq)) ++ goto out_unlock; ++ ++ name = (void *)attr->name; ++ cattr = sysaufs_si_attrs; ++ while (*cattr) { ++ if (!strcmp(name, (*cattr)->name)) { ++ err = container_of(*cattr, struct sysaufs_si_attr, attr) ++ ->show(seq, sb); ++ goto out_seq; ++ } ++ cattr++; ++ } ++ ++ if (!strncmp(name, SysaufsBrid_PREFIX, ++ sizeof(SysaufsBrid_PREFIX) - 1)) { ++ idx = AuBrSysfs_BRID; ++ name += sizeof(SysaufsBrid_PREFIX) - 1; ++ } else if (!strncmp(name, SysaufsBr_PREFIX, ++ sizeof(SysaufsBr_PREFIX) - 1)) { ++ idx = AuBrSysfs_BR; ++ name += sizeof(SysaufsBr_PREFIX) - 1; ++ } else ++ BUG(); ++ ++ err = kstrtol(name, 10, &l); ++ if (!err) { ++ bbot = au_sbbot(sb); ++ if (l <= bbot) ++ err = sysaufs_si_br(seq, sb, (aufs_bindex_t)l, idx); ++ else ++ err = -ENOENT; ++ } ++ ++out_seq: ++ if (!err) { ++ err = seq->count; ++ /* sysfs limit */ ++ if (unlikely(err == PAGE_SIZE)) ++ err = -EFBIG; ++ } ++ kfree(seq); ++out_unlock: ++ si_read_unlock(sb); ++out: ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int au_brinfo(struct super_block *sb, union aufs_brinfo __user *arg) ++{ ++ int err; ++ int16_t brid; ++ aufs_bindex_t bindex, bbot; ++ size_t sz; ++ char *buf; ++ struct seq_file *seq; ++ struct au_branch *br; ++ ++ si_read_lock(sb, AuLock_FLUSH); ++ bbot = au_sbbot(sb); ++ err = bbot + 1; ++ if (!arg) ++ goto out; ++ ++ err = -ENOMEM; ++ buf = (void *)__get_free_page(GFP_NOFS); ++ if (unlikely(!buf)) ++ goto out; ++ ++ seq = au_seq(buf, PAGE_SIZE); ++ err = PTR_ERR(seq); ++ if (IS_ERR(seq)) ++ goto out_buf; ++ ++ sz = sizeof(*arg) - offsetof(union aufs_brinfo, path); ++ for (bindex = 0; bindex <= bbot; bindex++, arg++) { ++ err = !access_ok(VERIFY_WRITE, arg, sizeof(*arg)); ++ if (unlikely(err)) ++ break; ++ ++ br = au_sbr(sb, bindex); ++ brid = br->br_id; ++ BUILD_BUG_ON(sizeof(brid) != sizeof(arg->id)); ++ err = __put_user(brid, &arg->id); ++ if (unlikely(err)) ++ break; ++ ++ BUILD_BUG_ON(sizeof(br->br_perm) != sizeof(arg->perm)); ++ err = __put_user(br->br_perm, &arg->perm); ++ if (unlikely(err)) ++ break; ++ ++ err = au_seq_path(seq, &br->br_path); ++ if (unlikely(err)) ++ break; ++ seq_putc(seq, '\0'); ++ if (!seq_has_overflowed(seq)) { ++ err = copy_to_user(arg->path, seq->buf, seq->count); ++ seq->count = 0; ++ if (unlikely(err)) ++ break; ++ } else { ++ err = -E2BIG; ++ goto out_seq; ++ } ++ } ++ if (unlikely(err)) ++ err = -EFAULT; ++ ++out_seq: ++ kfree(seq); ++out_buf: ++ free_page((unsigned long)buf); ++out: ++ si_read_unlock(sb); ++ return err; ++} ++ ++long au_brinfo_ioctl(struct file *file, unsigned long arg) ++{ ++ return au_brinfo(file->f_path.dentry->d_sb, (void __user *)arg); ++} ++ ++#ifdef CONFIG_COMPAT ++long au_brinfo_compat_ioctl(struct file *file, unsigned long arg) ++{ ++ return au_brinfo(file->f_path.dentry->d_sb, compat_ptr(arg)); ++} ++#endif ++ ++/* ---------------------------------------------------------------------- */ ++ ++void sysaufs_br_init(struct au_branch *br) ++{ ++ int i; ++ struct au_brsysfs *br_sysfs; ++ struct attribute *attr; ++ ++ br_sysfs = br->br_sysfs; ++ for (i = 0; i < ARRAY_SIZE(br->br_sysfs); i++) { ++ attr = &br_sysfs->attr; ++ sysfs_attr_init(attr); ++ attr->name = br_sysfs->name; ++ attr->mode = S_IRUGO; ++ br_sysfs++; ++ } ++} ++ ++void sysaufs_brs_del(struct super_block *sb, aufs_bindex_t bindex) ++{ ++ struct au_branch *br; ++ struct kobject *kobj; ++ struct au_brsysfs *br_sysfs; ++ int i; ++ aufs_bindex_t bbot; ++ ++ dbgaufs_brs_del(sb, bindex); ++ ++ if (!sysaufs_brs) ++ return; ++ ++ kobj = &au_sbi(sb)->si_kobj; ++ bbot = au_sbbot(sb); ++ for (; bindex <= bbot; bindex++) { ++ br = au_sbr(sb, bindex); ++ br_sysfs = br->br_sysfs; ++ for (i = 0; i < ARRAY_SIZE(br->br_sysfs); i++) { ++ sysfs_remove_file(kobj, &br_sysfs->attr); ++ br_sysfs++; ++ } ++ } ++} ++ ++void sysaufs_brs_add(struct super_block *sb, aufs_bindex_t bindex) ++{ ++ int err, i; ++ aufs_bindex_t bbot; ++ struct kobject *kobj; ++ struct au_branch *br; ++ struct au_brsysfs *br_sysfs; ++ ++ dbgaufs_brs_add(sb, bindex); ++ ++ if (!sysaufs_brs) ++ return; ++ ++ kobj = &au_sbi(sb)->si_kobj; ++ bbot = au_sbbot(sb); ++ for (; bindex <= bbot; bindex++) { ++ br = au_sbr(sb, bindex); ++ br_sysfs = br->br_sysfs; ++ snprintf(br_sysfs[AuBrSysfs_BR].name, sizeof(br_sysfs->name), ++ SysaufsBr_PREFIX "%d", bindex); ++ snprintf(br_sysfs[AuBrSysfs_BRID].name, sizeof(br_sysfs->name), ++ SysaufsBrid_PREFIX "%d", bindex); ++ for (i = 0; i < ARRAY_SIZE(br->br_sysfs); i++) { ++ err = sysfs_create_file(kobj, &br_sysfs->attr); ++ if (unlikely(err)) ++ pr_warn("failed %s under sysfs(%d)\n", ++ br_sysfs->name, err); ++ br_sysfs++; ++ } ++ } ++} +diff --git a/fs/aufs/sysrq.c b/fs/aufs/sysrq.c +new file mode 100644 +index 0000000..cbebb37 +--- /dev/null ++++ b/fs/aufs/sysrq.c +@@ -0,0 +1,157 @@ ++/* ++ * Copyright (C) 2005-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * magic sysrq hanlder ++ */ ++ ++/* #include */ ++#include ++#include "aufs.h" ++ ++/* ---------------------------------------------------------------------- */ ++ ++static void sysrq_sb(struct super_block *sb) ++{ ++ char *plevel; ++ struct au_sbinfo *sbinfo; ++ struct file *file; ++ struct au_sphlhead *files; ++ struct au_finfo *finfo; ++ ++ plevel = au_plevel; ++ au_plevel = KERN_WARNING; ++ ++ /* since we define pr_fmt, call printk directly */ ++#define pr(str) printk(KERN_WARNING AUFS_NAME ": " str) ++ ++ sbinfo = au_sbi(sb); ++ printk(KERN_WARNING "si=%lx\n", sysaufs_si_id(sbinfo)); ++ pr("superblock\n"); ++ au_dpri_sb(sb); ++ ++#if 0 ++ pr("root dentry\n"); ++ au_dpri_dentry(sb->s_root); ++ pr("root inode\n"); ++ au_dpri_inode(d_inode(sb->s_root)); ++#endif ++ ++#if 0 ++ do { ++ int err, i, j, ndentry; ++ struct au_dcsub_pages dpages; ++ struct au_dpage *dpage; ++ ++ err = au_dpages_init(&dpages, GFP_ATOMIC); ++ if (unlikely(err)) ++ break; ++ err = au_dcsub_pages(&dpages, sb->s_root, NULL, NULL); ++ if (!err) ++ for (i = 0; i < dpages.ndpage; i++) { ++ dpage = dpages.dpages + i; ++ ndentry = dpage->ndentry; ++ for (j = 0; j < ndentry; j++) ++ au_dpri_dentry(dpage->dentries[j]); ++ } ++ au_dpages_free(&dpages); ++ } while (0); ++#endif ++ ++#if 1 ++ { ++ struct inode *i; ++ ++ pr("isolated inode\n"); ++ spin_lock(&sb->s_inode_list_lock); ++ list_for_each_entry(i, &sb->s_inodes, i_sb_list) { ++ spin_lock(&i->i_lock); ++ if (1 || hlist_empty(&i->i_dentry)) ++ au_dpri_inode(i); ++ spin_unlock(&i->i_lock); ++ } ++ spin_unlock(&sb->s_inode_list_lock); ++ } ++#endif ++ pr("files\n"); ++ files = &au_sbi(sb)->si_files; ++ spin_lock(&files->spin); ++ hlist_for_each_entry(finfo, &files->head, fi_hlist) { ++ umode_t mode; ++ ++ file = finfo->fi_file; ++ mode = file_inode(file)->i_mode; ++ if (!special_file(mode)) ++ au_dpri_file(file); ++ } ++ spin_unlock(&files->spin); ++ pr("done\n"); ++ ++#undef pr ++ au_plevel = plevel; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* module parameter */ ++static char *aufs_sysrq_key = "a"; ++module_param_named(sysrq, aufs_sysrq_key, charp, S_IRUGO); ++MODULE_PARM_DESC(sysrq, "MagicSysRq key for " AUFS_NAME); ++ ++static void au_sysrq(int key __maybe_unused) ++{ ++ struct au_sbinfo *sbinfo; ++ ++ lockdep_off(); ++ au_sbilist_lock(); ++ hlist_for_each_entry(sbinfo, &au_sbilist.head, si_list) ++ sysrq_sb(sbinfo->si_sb); ++ au_sbilist_unlock(); ++ lockdep_on(); ++} ++ ++static struct sysrq_key_op au_sysrq_op = { ++ .handler = au_sysrq, ++ .help_msg = "Aufs", ++ .action_msg = "Aufs", ++ .enable_mask = SYSRQ_ENABLE_DUMP ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++int __init au_sysrq_init(void) ++{ ++ int err; ++ char key; ++ ++ err = -1; ++ key = *aufs_sysrq_key; ++ if ('a' <= key && key <= 'z') ++ err = register_sysrq_key(key, &au_sysrq_op); ++ if (unlikely(err)) ++ pr_err("err %d, sysrq=%c\n", err, key); ++ return err; ++} ++ ++void au_sysrq_fin(void) ++{ ++ int err; ++ ++ err = unregister_sysrq_key(*aufs_sysrq_key, &au_sysrq_op); ++ if (unlikely(err)) ++ pr_err("err %d (ignored)\n", err); ++} +diff --git a/fs/aufs/vdir.c b/fs/aufs/vdir.c +new file mode 100644 +index 0000000..fc5ae1e +--- /dev/null ++++ b/fs/aufs/vdir.c +@@ -0,0 +1,888 @@ ++/* ++ * Copyright (C) 2005-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * virtual or vertical directory ++ */ ++ ++#include "aufs.h" ++ ++static unsigned int calc_size(int nlen) ++{ ++ return ALIGN(sizeof(struct au_vdir_de) + nlen, sizeof(ino_t)); ++} ++ ++static int set_deblk_end(union au_vdir_deblk_p *p, ++ union au_vdir_deblk_p *deblk_end) ++{ ++ if (calc_size(0) <= deblk_end->deblk - p->deblk) { ++ p->de->de_str.len = 0; ++ /* smp_mb(); */ ++ return 0; ++ } ++ return -1; /* error */ ++} ++ ++/* returns true or false */ ++static int is_deblk_end(union au_vdir_deblk_p *p, ++ union au_vdir_deblk_p *deblk_end) ++{ ++ if (calc_size(0) <= deblk_end->deblk - p->deblk) ++ return !p->de->de_str.len; ++ return 1; ++} ++ ++static unsigned char *last_deblk(struct au_vdir *vdir) ++{ ++ return vdir->vd_deblk[vdir->vd_nblk - 1]; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* estimate the appropriate size for name hash table */ ++unsigned int au_rdhash_est(loff_t sz) ++{ ++ unsigned int n; ++ ++ n = UINT_MAX; ++ sz >>= 10; ++ if (sz < n) ++ n = sz; ++ if (sz < AUFS_RDHASH_DEF) ++ n = AUFS_RDHASH_DEF; ++ /* pr_info("n %u\n", n); */ ++ return n; ++} ++ ++/* ++ * the allocated memory has to be freed by ++ * au_nhash_wh_free() or au_nhash_de_free(). ++ */ ++int au_nhash_alloc(struct au_nhash *nhash, unsigned int num_hash, gfp_t gfp) ++{ ++ struct hlist_head *head; ++ unsigned int u; ++ size_t sz; ++ ++ sz = sizeof(*nhash->nh_head) * num_hash; ++ head = kmalloc(sz, gfp); ++ if (head) { ++ nhash->nh_num = num_hash; ++ nhash->nh_head = head; ++ for (u = 0; u < num_hash; u++) ++ INIT_HLIST_HEAD(head++); ++ return 0; /* success */ ++ } ++ ++ return -ENOMEM; ++} ++ ++static void nhash_count(struct hlist_head *head) ++{ ++#if 0 ++ unsigned long n; ++ struct hlist_node *pos; ++ ++ n = 0; ++ hlist_for_each(pos, head) ++ n++; ++ pr_info("%lu\n", n); ++#endif ++} ++ ++static void au_nhash_wh_do_free(struct hlist_head *head) ++{ ++ struct au_vdir_wh *pos; ++ struct hlist_node *node; ++ ++ hlist_for_each_entry_safe(pos, node, head, wh_hash) ++ kfree(pos); ++} ++ ++static void au_nhash_de_do_free(struct hlist_head *head) ++{ ++ struct au_vdir_dehstr *pos; ++ struct hlist_node *node; ++ ++ hlist_for_each_entry_safe(pos, node, head, hash) ++ au_cache_free_vdir_dehstr(pos); ++} ++ ++static void au_nhash_do_free(struct au_nhash *nhash, ++ void (*free)(struct hlist_head *head)) ++{ ++ unsigned int n; ++ struct hlist_head *head; ++ ++ n = nhash->nh_num; ++ if (!n) ++ return; ++ ++ head = nhash->nh_head; ++ while (n-- > 0) { ++ nhash_count(head); ++ free(head++); ++ } ++ kfree(nhash->nh_head); ++} ++ ++void au_nhash_wh_free(struct au_nhash *whlist) ++{ ++ au_nhash_do_free(whlist, au_nhash_wh_do_free); ++} ++ ++static void au_nhash_de_free(struct au_nhash *delist) ++{ ++ au_nhash_do_free(delist, au_nhash_de_do_free); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++int au_nhash_test_longer_wh(struct au_nhash *whlist, aufs_bindex_t btgt, ++ int limit) ++{ ++ int num; ++ unsigned int u, n; ++ struct hlist_head *head; ++ struct au_vdir_wh *pos; ++ ++ num = 0; ++ n = whlist->nh_num; ++ head = whlist->nh_head; ++ for (u = 0; u < n; u++, head++) ++ hlist_for_each_entry(pos, head, wh_hash) ++ if (pos->wh_bindex == btgt && ++num > limit) ++ return 1; ++ return 0; ++} ++ ++static struct hlist_head *au_name_hash(struct au_nhash *nhash, ++ unsigned char *name, ++ unsigned int len) ++{ ++ unsigned int v; ++ /* const unsigned int magic_bit = 12; */ ++ ++ AuDebugOn(!nhash->nh_num || !nhash->nh_head); ++ ++ v = 0; ++ while (len--) ++ v += *name++; ++ /* v = hash_long(v, magic_bit); */ ++ v %= nhash->nh_num; ++ return nhash->nh_head + v; ++} ++ ++static int au_nhash_test_name(struct au_vdir_destr *str, const char *name, ++ int nlen) ++{ ++ return str->len == nlen && !memcmp(str->name, name, nlen); ++} ++ ++/* returns found or not */ ++int au_nhash_test_known_wh(struct au_nhash *whlist, char *name, int nlen) ++{ ++ struct hlist_head *head; ++ struct au_vdir_wh *pos; ++ struct au_vdir_destr *str; ++ ++ head = au_name_hash(whlist, name, nlen); ++ hlist_for_each_entry(pos, head, wh_hash) { ++ str = &pos->wh_str; ++ AuDbg("%.*s\n", str->len, str->name); ++ if (au_nhash_test_name(str, name, nlen)) ++ return 1; ++ } ++ return 0; ++} ++ ++/* returns found(true) or not */ ++static int test_known(struct au_nhash *delist, char *name, int nlen) ++{ ++ struct hlist_head *head; ++ struct au_vdir_dehstr *pos; ++ struct au_vdir_destr *str; ++ ++ head = au_name_hash(delist, name, nlen); ++ hlist_for_each_entry(pos, head, hash) { ++ str = pos->str; ++ AuDbg("%.*s\n", str->len, str->name); ++ if (au_nhash_test_name(str, name, nlen)) ++ return 1; ++ } ++ return 0; ++} ++ ++static void au_shwh_init_wh(struct au_vdir_wh *wh, ino_t ino, ++ unsigned char d_type) ++{ ++#ifdef CONFIG_AUFS_SHWH ++ wh->wh_ino = ino; ++ wh->wh_type = d_type; ++#endif ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++int au_nhash_append_wh(struct au_nhash *whlist, char *name, int nlen, ino_t ino, ++ unsigned int d_type, aufs_bindex_t bindex, ++ unsigned char shwh) ++{ ++ int err; ++ struct au_vdir_destr *str; ++ struct au_vdir_wh *wh; ++ ++ AuDbg("%.*s\n", nlen, name); ++ AuDebugOn(!whlist->nh_num || !whlist->nh_head); ++ ++ err = -ENOMEM; ++ wh = kmalloc(sizeof(*wh) + nlen, GFP_NOFS); ++ if (unlikely(!wh)) ++ goto out; ++ ++ err = 0; ++ wh->wh_bindex = bindex; ++ if (shwh) ++ au_shwh_init_wh(wh, ino, d_type); ++ str = &wh->wh_str; ++ str->len = nlen; ++ memcpy(str->name, name, nlen); ++ hlist_add_head(&wh->wh_hash, au_name_hash(whlist, name, nlen)); ++ /* smp_mb(); */ ++ ++out: ++ return err; ++} ++ ++static int append_deblk(struct au_vdir *vdir) ++{ ++ int err; ++ unsigned long ul; ++ const unsigned int deblk_sz = vdir->vd_deblk_sz; ++ union au_vdir_deblk_p p, deblk_end; ++ unsigned char **o; ++ ++ err = -ENOMEM; ++ o = krealloc(vdir->vd_deblk, sizeof(*o) * (vdir->vd_nblk + 1), ++ GFP_NOFS); ++ if (unlikely(!o)) ++ goto out; ++ ++ vdir->vd_deblk = o; ++ p.deblk = kmalloc(deblk_sz, GFP_NOFS); ++ if (p.deblk) { ++ ul = vdir->vd_nblk++; ++ vdir->vd_deblk[ul] = p.deblk; ++ vdir->vd_last.ul = ul; ++ vdir->vd_last.p.deblk = p.deblk; ++ deblk_end.deblk = p.deblk + deblk_sz; ++ err = set_deblk_end(&p, &deblk_end); ++ } ++ ++out: ++ return err; ++} ++ ++static int append_de(struct au_vdir *vdir, char *name, int nlen, ino_t ino, ++ unsigned int d_type, struct au_nhash *delist) ++{ ++ int err; ++ unsigned int sz; ++ const unsigned int deblk_sz = vdir->vd_deblk_sz; ++ union au_vdir_deblk_p p, *room, deblk_end; ++ struct au_vdir_dehstr *dehstr; ++ ++ p.deblk = last_deblk(vdir); ++ deblk_end.deblk = p.deblk + deblk_sz; ++ room = &vdir->vd_last.p; ++ AuDebugOn(room->deblk < p.deblk || deblk_end.deblk <= room->deblk ++ || !is_deblk_end(room, &deblk_end)); ++ ++ sz = calc_size(nlen); ++ if (unlikely(sz > deblk_end.deblk - room->deblk)) { ++ err = append_deblk(vdir); ++ if (unlikely(err)) ++ goto out; ++ ++ p.deblk = last_deblk(vdir); ++ deblk_end.deblk = p.deblk + deblk_sz; ++ /* smp_mb(); */ ++ AuDebugOn(room->deblk != p.deblk); ++ } ++ ++ err = -ENOMEM; ++ dehstr = au_cache_alloc_vdir_dehstr(); ++ if (unlikely(!dehstr)) ++ goto out; ++ ++ dehstr->str = &room->de->de_str; ++ hlist_add_head(&dehstr->hash, au_name_hash(delist, name, nlen)); ++ room->de->de_ino = ino; ++ room->de->de_type = d_type; ++ room->de->de_str.len = nlen; ++ memcpy(room->de->de_str.name, name, nlen); ++ ++ err = 0; ++ room->deblk += sz; ++ if (unlikely(set_deblk_end(room, &deblk_end))) ++ err = append_deblk(vdir); ++ /* smp_mb(); */ ++ ++out: ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++void au_vdir_free(struct au_vdir *vdir) ++{ ++ unsigned char **deblk; ++ ++ deblk = vdir->vd_deblk; ++ while (vdir->vd_nblk--) ++ kfree(*deblk++); ++ kfree(vdir->vd_deblk); ++ au_cache_free_vdir(vdir); ++} ++ ++static struct au_vdir *alloc_vdir(struct file *file) ++{ ++ struct au_vdir *vdir; ++ struct super_block *sb; ++ int err; ++ ++ sb = file->f_path.dentry->d_sb; ++ SiMustAnyLock(sb); ++ ++ err = -ENOMEM; ++ vdir = au_cache_alloc_vdir(); ++ if (unlikely(!vdir)) ++ goto out; ++ ++ vdir->vd_deblk = kzalloc(sizeof(*vdir->vd_deblk), GFP_NOFS); ++ if (unlikely(!vdir->vd_deblk)) ++ goto out_free; ++ ++ vdir->vd_deblk_sz = au_sbi(sb)->si_rdblk; ++ if (!vdir->vd_deblk_sz) { ++ /* estimate the appropriate size for deblk */ ++ vdir->vd_deblk_sz = au_dir_size(file, /*dentry*/NULL); ++ /* pr_info("vd_deblk_sz %u\n", vdir->vd_deblk_sz); */ ++ } ++ vdir->vd_nblk = 0; ++ vdir->vd_version = 0; ++ vdir->vd_jiffy = 0; ++ err = append_deblk(vdir); ++ if (!err) ++ return vdir; /* success */ ++ ++ kfree(vdir->vd_deblk); ++ ++out_free: ++ au_cache_free_vdir(vdir); ++out: ++ vdir = ERR_PTR(err); ++ return vdir; ++} ++ ++static int reinit_vdir(struct au_vdir *vdir) ++{ ++ int err; ++ union au_vdir_deblk_p p, deblk_end; ++ ++ while (vdir->vd_nblk > 1) { ++ kfree(vdir->vd_deblk[vdir->vd_nblk - 1]); ++ /* vdir->vd_deblk[vdir->vd_nblk - 1] = NULL; */ ++ vdir->vd_nblk--; ++ } ++ p.deblk = vdir->vd_deblk[0]; ++ deblk_end.deblk = p.deblk + vdir->vd_deblk_sz; ++ err = set_deblk_end(&p, &deblk_end); ++ /* keep vd_dblk_sz */ ++ vdir->vd_last.ul = 0; ++ vdir->vd_last.p.deblk = vdir->vd_deblk[0]; ++ vdir->vd_version = 0; ++ vdir->vd_jiffy = 0; ++ /* smp_mb(); */ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++#define AuFillVdir_CALLED 1 ++#define AuFillVdir_WHABLE (1 << 1) ++#define AuFillVdir_SHWH (1 << 2) ++#define au_ftest_fillvdir(flags, name) ((flags) & AuFillVdir_##name) ++#define au_fset_fillvdir(flags, name) \ ++ do { (flags) |= AuFillVdir_##name; } while (0) ++#define au_fclr_fillvdir(flags, name) \ ++ do { (flags) &= ~AuFillVdir_##name; } while (0) ++ ++#ifndef CONFIG_AUFS_SHWH ++#undef AuFillVdir_SHWH ++#define AuFillVdir_SHWH 0 ++#endif ++ ++struct fillvdir_arg { ++ struct dir_context ctx; ++ struct file *file; ++ struct au_vdir *vdir; ++ struct au_nhash delist; ++ struct au_nhash whlist; ++ aufs_bindex_t bindex; ++ unsigned int flags; ++ int err; ++}; ++ ++static int fillvdir(struct dir_context *ctx, const char *__name, int nlen, ++ loff_t offset __maybe_unused, u64 h_ino, ++ unsigned int d_type) ++{ ++ struct fillvdir_arg *arg = container_of(ctx, struct fillvdir_arg, ctx); ++ char *name = (void *)__name; ++ struct super_block *sb; ++ ino_t ino; ++ const unsigned char shwh = !!au_ftest_fillvdir(arg->flags, SHWH); ++ ++ arg->err = 0; ++ sb = arg->file->f_path.dentry->d_sb; ++ au_fset_fillvdir(arg->flags, CALLED); ++ /* smp_mb(); */ ++ if (nlen <= AUFS_WH_PFX_LEN ++ || memcmp(name, AUFS_WH_PFX, AUFS_WH_PFX_LEN)) { ++ if (test_known(&arg->delist, name, nlen) ++ || au_nhash_test_known_wh(&arg->whlist, name, nlen)) ++ goto out; /* already exists or whiteouted */ ++ ++ arg->err = au_ino(sb, arg->bindex, h_ino, d_type, &ino); ++ if (!arg->err) { ++ if (unlikely(nlen > AUFS_MAX_NAMELEN)) ++ d_type = DT_UNKNOWN; ++ arg->err = append_de(arg->vdir, name, nlen, ino, ++ d_type, &arg->delist); ++ } ++ } else if (au_ftest_fillvdir(arg->flags, WHABLE)) { ++ name += AUFS_WH_PFX_LEN; ++ nlen -= AUFS_WH_PFX_LEN; ++ if (au_nhash_test_known_wh(&arg->whlist, name, nlen)) ++ goto out; /* already whiteouted */ ++ ++ if (shwh) ++ arg->err = au_wh_ino(sb, arg->bindex, h_ino, d_type, ++ &ino); ++ if (!arg->err) { ++ if (nlen <= AUFS_MAX_NAMELEN + AUFS_WH_PFX_LEN) ++ d_type = DT_UNKNOWN; ++ arg->err = au_nhash_append_wh ++ (&arg->whlist, name, nlen, ino, d_type, ++ arg->bindex, shwh); ++ } ++ } ++ ++out: ++ if (!arg->err) ++ arg->vdir->vd_jiffy = jiffies; ++ /* smp_mb(); */ ++ AuTraceErr(arg->err); ++ return arg->err; ++} ++ ++static int au_handle_shwh(struct super_block *sb, struct au_vdir *vdir, ++ struct au_nhash *whlist, struct au_nhash *delist) ++{ ++#ifdef CONFIG_AUFS_SHWH ++ int err; ++ unsigned int nh, u; ++ struct hlist_head *head; ++ struct au_vdir_wh *pos; ++ struct hlist_node *n; ++ char *p, *o; ++ struct au_vdir_destr *destr; ++ ++ AuDebugOn(!au_opt_test(au_mntflags(sb), SHWH)); ++ ++ err = -ENOMEM; ++ o = p = (void *)__get_free_page(GFP_NOFS); ++ if (unlikely(!p)) ++ goto out; ++ ++ err = 0; ++ nh = whlist->nh_num; ++ memcpy(p, AUFS_WH_PFX, AUFS_WH_PFX_LEN); ++ p += AUFS_WH_PFX_LEN; ++ for (u = 0; u < nh; u++) { ++ head = whlist->nh_head + u; ++ hlist_for_each_entry_safe(pos, n, head, wh_hash) { ++ destr = &pos->wh_str; ++ memcpy(p, destr->name, destr->len); ++ err = append_de(vdir, o, destr->len + AUFS_WH_PFX_LEN, ++ pos->wh_ino, pos->wh_type, delist); ++ if (unlikely(err)) ++ break; ++ } ++ } ++ ++ free_page((unsigned long)o); ++ ++out: ++ AuTraceErr(err); ++ return err; ++#else ++ return 0; ++#endif ++} ++ ++static int au_do_read_vdir(struct fillvdir_arg *arg) ++{ ++ int err; ++ unsigned int rdhash; ++ loff_t offset; ++ aufs_bindex_t bbot, bindex, btop; ++ unsigned char shwh; ++ struct file *hf, *file; ++ struct super_block *sb; ++ ++ file = arg->file; ++ sb = file->f_path.dentry->d_sb; ++ SiMustAnyLock(sb); ++ ++ rdhash = au_sbi(sb)->si_rdhash; ++ if (!rdhash) ++ rdhash = au_rdhash_est(au_dir_size(file, /*dentry*/NULL)); ++ err = au_nhash_alloc(&arg->delist, rdhash, GFP_NOFS); ++ if (unlikely(err)) ++ goto out; ++ err = au_nhash_alloc(&arg->whlist, rdhash, GFP_NOFS); ++ if (unlikely(err)) ++ goto out_delist; ++ ++ err = 0; ++ arg->flags = 0; ++ shwh = 0; ++ if (au_opt_test(au_mntflags(sb), SHWH)) { ++ shwh = 1; ++ au_fset_fillvdir(arg->flags, SHWH); ++ } ++ btop = au_fbtop(file); ++ bbot = au_fbbot_dir(file); ++ for (bindex = btop; !err && bindex <= bbot; bindex++) { ++ hf = au_hf_dir(file, bindex); ++ if (!hf) ++ continue; ++ ++ offset = vfsub_llseek(hf, 0, SEEK_SET); ++ err = offset; ++ if (unlikely(offset)) ++ break; ++ ++ arg->bindex = bindex; ++ au_fclr_fillvdir(arg->flags, WHABLE); ++ if (shwh ++ || (bindex != bbot ++ && au_br_whable(au_sbr_perm(sb, bindex)))) ++ au_fset_fillvdir(arg->flags, WHABLE); ++ do { ++ arg->err = 0; ++ au_fclr_fillvdir(arg->flags, CALLED); ++ /* smp_mb(); */ ++ err = vfsub_iterate_dir(hf, &arg->ctx); ++ if (err >= 0) ++ err = arg->err; ++ } while (!err && au_ftest_fillvdir(arg->flags, CALLED)); ++ ++ /* ++ * dir_relax() may be good for concurrency, but aufs should not ++ * use it since it will cause a lockdep problem. ++ */ ++ } ++ ++ if (!err && shwh) ++ err = au_handle_shwh(sb, arg->vdir, &arg->whlist, &arg->delist); ++ ++ au_nhash_wh_free(&arg->whlist); ++ ++out_delist: ++ au_nhash_de_free(&arg->delist); ++out: ++ return err; ++} ++ ++static int read_vdir(struct file *file, int may_read) ++{ ++ int err; ++ unsigned long expire; ++ unsigned char do_read; ++ struct fillvdir_arg arg = { ++ .ctx = { ++ .actor = fillvdir ++ } ++ }; ++ struct inode *inode; ++ struct au_vdir *vdir, *allocated; ++ ++ err = 0; ++ inode = file_inode(file); ++ IMustLock(inode); ++ SiMustAnyLock(inode->i_sb); ++ ++ allocated = NULL; ++ do_read = 0; ++ expire = au_sbi(inode->i_sb)->si_rdcache; ++ vdir = au_ivdir(inode); ++ if (!vdir) { ++ do_read = 1; ++ vdir = alloc_vdir(file); ++ err = PTR_ERR(vdir); ++ if (IS_ERR(vdir)) ++ goto out; ++ err = 0; ++ allocated = vdir; ++ } else if (may_read ++ && (inode->i_version != vdir->vd_version ++ || time_after(jiffies, vdir->vd_jiffy + expire))) { ++ do_read = 1; ++ err = reinit_vdir(vdir); ++ if (unlikely(err)) ++ goto out; ++ } ++ ++ if (!do_read) ++ return 0; /* success */ ++ ++ arg.file = file; ++ arg.vdir = vdir; ++ err = au_do_read_vdir(&arg); ++ if (!err) { ++ /* file->f_pos = 0; */ /* todo: ctx->pos? */ ++ vdir->vd_version = inode->i_version; ++ vdir->vd_last.ul = 0; ++ vdir->vd_last.p.deblk = vdir->vd_deblk[0]; ++ if (allocated) ++ au_set_ivdir(inode, allocated); ++ } else if (allocated) ++ au_vdir_free(allocated); ++ ++out: ++ return err; ++} ++ ++static int copy_vdir(struct au_vdir *tgt, struct au_vdir *src) ++{ ++ int err, rerr; ++ unsigned long ul, n; ++ const unsigned int deblk_sz = src->vd_deblk_sz; ++ ++ AuDebugOn(tgt->vd_nblk != 1); ++ ++ err = -ENOMEM; ++ if (tgt->vd_nblk < src->vd_nblk) { ++ unsigned char **p; ++ ++ p = krealloc(tgt->vd_deblk, sizeof(*p) * src->vd_nblk, ++ GFP_NOFS); ++ if (unlikely(!p)) ++ goto out; ++ tgt->vd_deblk = p; ++ } ++ ++ if (tgt->vd_deblk_sz != deblk_sz) { ++ unsigned char *p; ++ ++ tgt->vd_deblk_sz = deblk_sz; ++ p = krealloc(tgt->vd_deblk[0], deblk_sz, GFP_NOFS); ++ if (unlikely(!p)) ++ goto out; ++ tgt->vd_deblk[0] = p; ++ } ++ memcpy(tgt->vd_deblk[0], src->vd_deblk[0], deblk_sz); ++ tgt->vd_version = src->vd_version; ++ tgt->vd_jiffy = src->vd_jiffy; ++ ++ n = src->vd_nblk; ++ for (ul = 1; ul < n; ul++) { ++ tgt->vd_deblk[ul] = kmemdup(src->vd_deblk[ul], deblk_sz, ++ GFP_NOFS); ++ if (unlikely(!tgt->vd_deblk[ul])) ++ goto out; ++ tgt->vd_nblk++; ++ } ++ tgt->vd_nblk = n; ++ tgt->vd_last.ul = tgt->vd_last.ul; ++ tgt->vd_last.p.deblk = tgt->vd_deblk[tgt->vd_last.ul]; ++ tgt->vd_last.p.deblk += src->vd_last.p.deblk ++ - src->vd_deblk[src->vd_last.ul]; ++ /* smp_mb(); */ ++ return 0; /* success */ ++ ++out: ++ rerr = reinit_vdir(tgt); ++ BUG_ON(rerr); ++ return err; ++} ++ ++int au_vdir_init(struct file *file) ++{ ++ int err; ++ struct inode *inode; ++ struct au_vdir *vdir_cache, *allocated; ++ ++ /* test file->f_pos here instead of ctx->pos */ ++ err = read_vdir(file, !file->f_pos); ++ if (unlikely(err)) ++ goto out; ++ ++ allocated = NULL; ++ vdir_cache = au_fvdir_cache(file); ++ if (!vdir_cache) { ++ vdir_cache = alloc_vdir(file); ++ err = PTR_ERR(vdir_cache); ++ if (IS_ERR(vdir_cache)) ++ goto out; ++ allocated = vdir_cache; ++ } else if (!file->f_pos && vdir_cache->vd_version != file->f_version) { ++ /* test file->f_pos here instead of ctx->pos */ ++ err = reinit_vdir(vdir_cache); ++ if (unlikely(err)) ++ goto out; ++ } else ++ return 0; /* success */ ++ ++ inode = file_inode(file); ++ err = copy_vdir(vdir_cache, au_ivdir(inode)); ++ if (!err) { ++ file->f_version = inode->i_version; ++ if (allocated) ++ au_set_fvdir_cache(file, allocated); ++ } else if (allocated) ++ au_vdir_free(allocated); ++ ++out: ++ return err; ++} ++ ++static loff_t calc_offset(struct au_vdir *vdir) ++{ ++ loff_t offset; ++ union au_vdir_deblk_p p; ++ ++ p.deblk = vdir->vd_deblk[vdir->vd_last.ul]; ++ offset = vdir->vd_last.p.deblk - p.deblk; ++ offset += vdir->vd_deblk_sz * vdir->vd_last.ul; ++ return offset; ++} ++ ++/* returns true or false */ ++static int seek_vdir(struct file *file, struct dir_context *ctx) ++{ ++ int valid; ++ unsigned int deblk_sz; ++ unsigned long ul, n; ++ loff_t offset; ++ union au_vdir_deblk_p p, deblk_end; ++ struct au_vdir *vdir_cache; ++ ++ valid = 1; ++ vdir_cache = au_fvdir_cache(file); ++ offset = calc_offset(vdir_cache); ++ AuDbg("offset %lld\n", offset); ++ if (ctx->pos == offset) ++ goto out; ++ ++ vdir_cache->vd_last.ul = 0; ++ vdir_cache->vd_last.p.deblk = vdir_cache->vd_deblk[0]; ++ if (!ctx->pos) ++ goto out; ++ ++ valid = 0; ++ deblk_sz = vdir_cache->vd_deblk_sz; ++ ul = div64_u64(ctx->pos, deblk_sz); ++ AuDbg("ul %lu\n", ul); ++ if (ul >= vdir_cache->vd_nblk) ++ goto out; ++ ++ n = vdir_cache->vd_nblk; ++ for (; ul < n; ul++) { ++ p.deblk = vdir_cache->vd_deblk[ul]; ++ deblk_end.deblk = p.deblk + deblk_sz; ++ offset = ul; ++ offset *= deblk_sz; ++ while (!is_deblk_end(&p, &deblk_end) && offset < ctx->pos) { ++ unsigned int l; ++ ++ l = calc_size(p.de->de_str.len); ++ offset += l; ++ p.deblk += l; ++ } ++ if (!is_deblk_end(&p, &deblk_end)) { ++ valid = 1; ++ vdir_cache->vd_last.ul = ul; ++ vdir_cache->vd_last.p = p; ++ break; ++ } ++ } ++ ++out: ++ /* smp_mb(); */ ++ AuTraceErr(!valid); ++ return valid; ++} ++ ++int au_vdir_fill_de(struct file *file, struct dir_context *ctx) ++{ ++ unsigned int l, deblk_sz; ++ union au_vdir_deblk_p deblk_end; ++ struct au_vdir *vdir_cache; ++ struct au_vdir_de *de; ++ ++ vdir_cache = au_fvdir_cache(file); ++ if (!seek_vdir(file, ctx)) ++ return 0; ++ ++ deblk_sz = vdir_cache->vd_deblk_sz; ++ while (1) { ++ deblk_end.deblk = vdir_cache->vd_deblk[vdir_cache->vd_last.ul]; ++ deblk_end.deblk += deblk_sz; ++ while (!is_deblk_end(&vdir_cache->vd_last.p, &deblk_end)) { ++ de = vdir_cache->vd_last.p.de; ++ AuDbg("%.*s, off%lld, i%lu, dt%d\n", ++ de->de_str.len, de->de_str.name, ctx->pos, ++ (unsigned long)de->de_ino, de->de_type); ++ if (unlikely(!dir_emit(ctx, de->de_str.name, ++ de->de_str.len, de->de_ino, ++ de->de_type))) { ++ /* todo: ignore the error caused by udba? */ ++ /* return err; */ ++ return 0; ++ } ++ ++ l = calc_size(de->de_str.len); ++ vdir_cache->vd_last.p.deblk += l; ++ ctx->pos += l; ++ } ++ if (vdir_cache->vd_last.ul < vdir_cache->vd_nblk - 1) { ++ vdir_cache->vd_last.ul++; ++ vdir_cache->vd_last.p.deblk ++ = vdir_cache->vd_deblk[vdir_cache->vd_last.ul]; ++ ctx->pos = deblk_sz * vdir_cache->vd_last.ul; ++ continue; ++ } ++ break; ++ } ++ ++ /* smp_mb(); */ ++ return 0; ++} +diff --git a/fs/aufs/vfsub.c b/fs/aufs/vfsub.c +new file mode 100644 +index 0000000..2d01dd8 +--- /dev/null ++++ b/fs/aufs/vfsub.c +@@ -0,0 +1,884 @@ ++/* ++ * Copyright (C) 2005-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * sub-routines for VFS ++ */ ++ ++#include ++#include ++#include ++#include ++#include "../fs/mount.h" ++#include "aufs.h" ++ ++#ifdef CONFIG_AUFS_BR_FUSE ++int vfsub_test_mntns(struct vfsmount *mnt, struct super_block *h_sb) ++{ ++ struct nsproxy *ns; ++ ++ if (!au_test_fuse(h_sb) || !au_userns) ++ return 0; ++ ++ ns = current->nsproxy; ++ /* no {get,put}_nsproxy(ns) */ ++ return real_mount(mnt)->mnt_ns == ns->mnt_ns ? 0 : -EACCES; ++} ++#endif ++ ++/* ---------------------------------------------------------------------- */ ++ ++int vfsub_update_h_iattr(struct path *h_path, int *did) ++{ ++ int err; ++ struct kstat st; ++ struct super_block *h_sb; ++ ++ /* for remote fs, leave work for its getattr or d_revalidate */ ++ /* for bad i_attr fs, handle them in aufs_getattr() */ ++ /* still some fs may acquire i_mutex. we need to skip them */ ++ err = 0; ++ if (!did) ++ did = &err; ++ h_sb = h_path->dentry->d_sb; ++ *did = (!au_test_fs_remote(h_sb) && au_test_fs_refresh_iattr(h_sb)); ++ if (*did) ++ err = vfs_getattr(h_path, &st); ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct file *vfsub_dentry_open(struct path *path, int flags) ++{ ++ struct file *file; ++ ++ file = dentry_open(path, flags /* | __FMODE_NONOTIFY */, ++ current_cred()); ++ if (!IS_ERR_OR_NULL(file) ++ && (file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) ++ i_readcount_inc(d_inode(path->dentry)); ++ ++ return file; ++} ++ ++struct file *vfsub_filp_open(const char *path, int oflags, int mode) ++{ ++ struct file *file; ++ ++ lockdep_off(); ++ file = filp_open(path, ++ oflags /* | __FMODE_NONOTIFY */, ++ mode); ++ lockdep_on(); ++ if (IS_ERR(file)) ++ goto out; ++ vfsub_update_h_iattr(&file->f_path, /*did*/NULL); /*ignore*/ ++ ++out: ++ return file; ++} ++ ++/* ++ * Ideally this function should call VFS:do_last() in order to keep all its ++ * checkings. But it is very hard for aufs to regenerate several VFS internal ++ * structure such as nameidata. This is a second (or third) best approach. ++ * cf. linux/fs/namei.c:do_last(), lookup_open() and atomic_open(). ++ */ ++int vfsub_atomic_open(struct inode *dir, struct dentry *dentry, ++ struct vfsub_aopen_args *args, struct au_branch *br) ++{ ++ int err; ++ struct file *file = args->file; ++ /* copied from linux/fs/namei.c:atomic_open() */ ++ struct dentry *const DENTRY_NOT_SET = (void *)-1UL; ++ ++ IMustLock(dir); ++ AuDebugOn(!dir->i_op->atomic_open); ++ ++ err = au_br_test_oflag(args->open_flag, br); ++ if (unlikely(err)) ++ goto out; ++ ++ args->file->f_path.dentry = DENTRY_NOT_SET; ++ args->file->f_path.mnt = au_br_mnt(br); ++ err = dir->i_op->atomic_open(dir, dentry, file, args->open_flag, ++ args->create_mode, args->opened); ++ if (err >= 0) { ++ /* some filesystems don't set FILE_CREATED while succeeded? */ ++ if (*args->opened & FILE_CREATED) ++ fsnotify_create(dir, dentry); ++ } else ++ goto out; ++ ++ ++ if (!err) { ++ /* todo: call VFS:may_open() here */ ++ err = open_check_o_direct(file); ++ /* todo: ima_file_check() too? */ ++ if (!err && (args->open_flag & __FMODE_EXEC)) ++ err = deny_write_access(file); ++ if (unlikely(err)) ++ /* note that the file is created and still opened */ ++ goto out; ++ } ++ ++ au_br_get(br); ++ fsnotify_open(file); ++ ++out: ++ return err; ++} ++ ++int vfsub_kern_path(const char *name, unsigned int flags, struct path *path) ++{ ++ int err; ++ ++ err = kern_path(name, flags, path); ++ if (!err && d_is_positive(path->dentry)) ++ vfsub_update_h_iattr(path, /*did*/NULL); /*ignore*/ ++ return err; ++} ++ ++struct dentry *vfsub_lookup_one_len_unlocked(const char *name, ++ struct dentry *parent, int len) ++{ ++ struct path path = { ++ .mnt = NULL ++ }; ++ ++ path.dentry = lookup_one_len_unlocked(name, parent, len); ++ if (IS_ERR(path.dentry)) ++ goto out; ++ if (d_is_positive(path.dentry)) ++ vfsub_update_h_iattr(&path, /*did*/NULL); /*ignore*/ ++ ++out: ++ AuTraceErrPtr(path.dentry); ++ return path.dentry; ++} ++ ++struct dentry *vfsub_lookup_one_len(const char *name, struct dentry *parent, ++ int len) ++{ ++ struct path path = { ++ .mnt = NULL ++ }; ++ ++ /* VFS checks it too, but by WARN_ON_ONCE() */ ++ IMustLock(d_inode(parent)); ++ ++ path.dentry = lookup_one_len(name, parent, len); ++ if (IS_ERR(path.dentry)) ++ goto out; ++ if (d_is_positive(path.dentry)) ++ vfsub_update_h_iattr(&path, /*did*/NULL); /*ignore*/ ++ ++out: ++ AuTraceErrPtr(path.dentry); ++ return path.dentry; ++} ++ ++void vfsub_call_lkup_one(void *args) ++{ ++ struct vfsub_lkup_one_args *a = args; ++ *a->errp = vfsub_lkup_one(a->name, a->parent); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct dentry *vfsub_lock_rename(struct dentry *d1, struct au_hinode *hdir1, ++ struct dentry *d2, struct au_hinode *hdir2) ++{ ++ struct dentry *d; ++ ++ lockdep_off(); ++ d = lock_rename(d1, d2); ++ lockdep_on(); ++ au_hn_suspend(hdir1); ++ if (hdir1 != hdir2) ++ au_hn_suspend(hdir2); ++ ++ return d; ++} ++ ++void vfsub_unlock_rename(struct dentry *d1, struct au_hinode *hdir1, ++ struct dentry *d2, struct au_hinode *hdir2) ++{ ++ au_hn_resume(hdir1); ++ if (hdir1 != hdir2) ++ au_hn_resume(hdir2); ++ lockdep_off(); ++ unlock_rename(d1, d2); ++ lockdep_on(); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++int vfsub_create(struct inode *dir, struct path *path, int mode, bool want_excl) ++{ ++ int err; ++ struct dentry *d; ++ ++ IMustLock(dir); ++ ++ d = path->dentry; ++ path->dentry = d->d_parent; ++ err = security_path_mknod(path, d, mode, 0); ++ path->dentry = d; ++ if (unlikely(err)) ++ goto out; ++ ++ lockdep_off(); ++ err = vfs_create(dir, path->dentry, mode, want_excl); ++ lockdep_on(); ++ if (!err) { ++ struct path tmp = *path; ++ int did; ++ ++ vfsub_update_h_iattr(&tmp, &did); ++ if (did) { ++ tmp.dentry = path->dentry->d_parent; ++ vfsub_update_h_iattr(&tmp, /*did*/NULL); ++ } ++ /*ignore*/ ++ } ++ ++out: ++ return err; ++} ++ ++int vfsub_symlink(struct inode *dir, struct path *path, const char *symname) ++{ ++ int err; ++ struct dentry *d; ++ ++ IMustLock(dir); ++ ++ d = path->dentry; ++ path->dentry = d->d_parent; ++ err = security_path_symlink(path, d, symname); ++ path->dentry = d; ++ if (unlikely(err)) ++ goto out; ++ ++ lockdep_off(); ++ err = vfs_symlink(dir, path->dentry, symname); ++ lockdep_on(); ++ if (!err) { ++ struct path tmp = *path; ++ int did; ++ ++ vfsub_update_h_iattr(&tmp, &did); ++ if (did) { ++ tmp.dentry = path->dentry->d_parent; ++ vfsub_update_h_iattr(&tmp, /*did*/NULL); ++ } ++ /*ignore*/ ++ } ++ ++out: ++ return err; ++} ++ ++int vfsub_mknod(struct inode *dir, struct path *path, int mode, dev_t dev) ++{ ++ int err; ++ struct dentry *d; ++ ++ IMustLock(dir); ++ ++ d = path->dentry; ++ path->dentry = d->d_parent; ++ err = security_path_mknod(path, d, mode, new_encode_dev(dev)); ++ path->dentry = d; ++ if (unlikely(err)) ++ goto out; ++ ++ lockdep_off(); ++ err = vfs_mknod(dir, path->dentry, mode, dev); ++ lockdep_on(); ++ if (!err) { ++ struct path tmp = *path; ++ int did; ++ ++ vfsub_update_h_iattr(&tmp, &did); ++ if (did) { ++ tmp.dentry = path->dentry->d_parent; ++ vfsub_update_h_iattr(&tmp, /*did*/NULL); ++ } ++ /*ignore*/ ++ } ++ ++out: ++ return err; ++} ++ ++static int au_test_nlink(struct inode *inode) ++{ ++ const unsigned int link_max = UINT_MAX >> 1; /* rough margin */ ++ ++ if (!au_test_fs_no_limit_nlink(inode->i_sb) ++ || inode->i_nlink < link_max) ++ return 0; ++ return -EMLINK; ++} ++ ++int vfsub_link(struct dentry *src_dentry, struct inode *dir, struct path *path, ++ struct inode **delegated_inode) ++{ ++ int err; ++ struct dentry *d; ++ ++ IMustLock(dir); ++ ++ err = au_test_nlink(d_inode(src_dentry)); ++ if (unlikely(err)) ++ return err; ++ ++ /* we don't call may_linkat() */ ++ d = path->dentry; ++ path->dentry = d->d_parent; ++ err = security_path_link(src_dentry, path, d); ++ path->dentry = d; ++ if (unlikely(err)) ++ goto out; ++ ++ lockdep_off(); ++ err = vfs_link(src_dentry, dir, path->dentry, delegated_inode); ++ lockdep_on(); ++ if (!err) { ++ struct path tmp = *path; ++ int did; ++ ++ /* fuse has different memory inode for the same inumber */ ++ vfsub_update_h_iattr(&tmp, &did); ++ if (did) { ++ tmp.dentry = path->dentry->d_parent; ++ vfsub_update_h_iattr(&tmp, /*did*/NULL); ++ tmp.dentry = src_dentry; ++ vfsub_update_h_iattr(&tmp, /*did*/NULL); ++ } ++ /*ignore*/ ++ } ++ ++out: ++ return err; ++} ++ ++int vfsub_rename(struct inode *src_dir, struct dentry *src_dentry, ++ struct inode *dir, struct path *path, ++ struct inode **delegated_inode) ++{ ++ int err; ++ struct path tmp = { ++ .mnt = path->mnt ++ }; ++ struct dentry *d; ++ ++ IMustLock(dir); ++ IMustLock(src_dir); ++ ++ d = path->dentry; ++ path->dentry = d->d_parent; ++ tmp.dentry = src_dentry->d_parent; ++ err = security_path_rename(&tmp, src_dentry, path, d, /*flags*/0); ++ path->dentry = d; ++ if (unlikely(err)) ++ goto out; ++ ++ lockdep_off(); ++ err = vfs_rename(src_dir, src_dentry, dir, path->dentry, ++ delegated_inode, /*flags*/0); ++ lockdep_on(); ++ if (!err) { ++ int did; ++ ++ tmp.dentry = d->d_parent; ++ vfsub_update_h_iattr(&tmp, &did); ++ if (did) { ++ tmp.dentry = src_dentry; ++ vfsub_update_h_iattr(&tmp, /*did*/NULL); ++ tmp.dentry = src_dentry->d_parent; ++ vfsub_update_h_iattr(&tmp, /*did*/NULL); ++ } ++ /*ignore*/ ++ } ++ ++out: ++ return err; ++} ++ ++int vfsub_mkdir(struct inode *dir, struct path *path, int mode) ++{ ++ int err; ++ struct dentry *d; ++ ++ IMustLock(dir); ++ ++ d = path->dentry; ++ path->dentry = d->d_parent; ++ err = security_path_mkdir(path, d, mode); ++ path->dentry = d; ++ if (unlikely(err)) ++ goto out; ++ ++ lockdep_off(); ++ err = vfs_mkdir(dir, path->dentry, mode); ++ lockdep_on(); ++ if (!err) { ++ struct path tmp = *path; ++ int did; ++ ++ vfsub_update_h_iattr(&tmp, &did); ++ if (did) { ++ tmp.dentry = path->dentry->d_parent; ++ vfsub_update_h_iattr(&tmp, /*did*/NULL); ++ } ++ /*ignore*/ ++ } ++ ++out: ++ return err; ++} ++ ++int vfsub_rmdir(struct inode *dir, struct path *path) ++{ ++ int err; ++ struct dentry *d; ++ ++ IMustLock(dir); ++ ++ d = path->dentry; ++ path->dentry = d->d_parent; ++ err = security_path_rmdir(path, d); ++ path->dentry = d; ++ if (unlikely(err)) ++ goto out; ++ ++ lockdep_off(); ++ err = vfs_rmdir(dir, path->dentry); ++ lockdep_on(); ++ if (!err) { ++ struct path tmp = { ++ .dentry = path->dentry->d_parent, ++ .mnt = path->mnt ++ }; ++ ++ vfsub_update_h_iattr(&tmp, /*did*/NULL); /*ignore*/ ++ } ++ ++out: ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* todo: support mmap_sem? */ ++ssize_t vfsub_read_u(struct file *file, char __user *ubuf, size_t count, ++ loff_t *ppos) ++{ ++ ssize_t err; ++ ++ lockdep_off(); ++ err = vfs_read(file, ubuf, count, ppos); ++ lockdep_on(); ++ if (err >= 0) ++ vfsub_update_h_iattr(&file->f_path, /*did*/NULL); /*ignore*/ ++ return err; ++} ++ ++/* todo: kernel_read()? */ ++ssize_t vfsub_read_k(struct file *file, void *kbuf, size_t count, ++ loff_t *ppos) ++{ ++ ssize_t err; ++ mm_segment_t oldfs; ++ union { ++ void *k; ++ char __user *u; ++ } buf; ++ ++ buf.k = kbuf; ++ oldfs = get_fs(); ++ set_fs(KERNEL_DS); ++ err = vfsub_read_u(file, buf.u, count, ppos); ++ set_fs(oldfs); ++ return err; ++} ++ ++ssize_t vfsub_write_u(struct file *file, const char __user *ubuf, size_t count, ++ loff_t *ppos) ++{ ++ ssize_t err; ++ ++ lockdep_off(); ++ err = vfs_write(file, ubuf, count, ppos); ++ lockdep_on(); ++ if (err >= 0) ++ vfsub_update_h_iattr(&file->f_path, /*did*/NULL); /*ignore*/ ++ return err; ++} ++ ++ssize_t vfsub_write_k(struct file *file, void *kbuf, size_t count, loff_t *ppos) ++{ ++ ssize_t err; ++ mm_segment_t oldfs; ++ union { ++ void *k; ++ const char __user *u; ++ } buf; ++ ++ buf.k = kbuf; ++ oldfs = get_fs(); ++ set_fs(KERNEL_DS); ++ err = vfsub_write_u(file, buf.u, count, ppos); ++ set_fs(oldfs); ++ return err; ++} ++ ++int vfsub_flush(struct file *file, fl_owner_t id) ++{ ++ int err; ++ ++ err = 0; ++ if (file->f_op->flush) { ++ if (!au_test_nfs(file->f_path.dentry->d_sb)) ++ err = file->f_op->flush(file, id); ++ else { ++ lockdep_off(); ++ err = file->f_op->flush(file, id); ++ lockdep_on(); ++ } ++ if (!err) ++ vfsub_update_h_iattr(&file->f_path, /*did*/NULL); ++ /*ignore*/ ++ } ++ return err; ++} ++ ++int vfsub_iterate_dir(struct file *file, struct dir_context *ctx) ++{ ++ int err; ++ ++ AuDbg("%pD, ctx{%pf, %llu}\n", file, ctx->actor, ctx->pos); ++ ++ lockdep_off(); ++ err = iterate_dir(file, ctx); ++ lockdep_on(); ++ if (err >= 0) ++ vfsub_update_h_iattr(&file->f_path, /*did*/NULL); /*ignore*/ ++ return err; ++} ++ ++long vfsub_splice_to(struct file *in, loff_t *ppos, ++ struct pipe_inode_info *pipe, size_t len, ++ unsigned int flags) ++{ ++ long err; ++ ++ lockdep_off(); ++ err = do_splice_to(in, ppos, pipe, len, flags); ++ lockdep_on(); ++ file_accessed(in); ++ if (err >= 0) ++ vfsub_update_h_iattr(&in->f_path, /*did*/NULL); /*ignore*/ ++ return err; ++} ++ ++long vfsub_splice_from(struct pipe_inode_info *pipe, struct file *out, ++ loff_t *ppos, size_t len, unsigned int flags) ++{ ++ long err; ++ ++ lockdep_off(); ++ err = do_splice_from(pipe, out, ppos, len, flags); ++ lockdep_on(); ++ if (err >= 0) ++ vfsub_update_h_iattr(&out->f_path, /*did*/NULL); /*ignore*/ ++ return err; ++} ++ ++int vfsub_fsync(struct file *file, struct path *path, int datasync) ++{ ++ int err; ++ ++ /* file can be NULL */ ++ lockdep_off(); ++ err = vfs_fsync(file, datasync); ++ lockdep_on(); ++ if (!err) { ++ if (!path) { ++ AuDebugOn(!file); ++ path = &file->f_path; ++ } ++ vfsub_update_h_iattr(path, /*did*/NULL); /*ignore*/ ++ } ++ return err; ++} ++ ++/* cf. open.c:do_sys_truncate() and do_sys_ftruncate() */ ++int vfsub_trunc(struct path *h_path, loff_t length, unsigned int attr, ++ struct file *h_file) ++{ ++ int err; ++ struct inode *h_inode; ++ struct super_block *h_sb; ++ ++ if (!h_file) { ++ err = vfsub_truncate(h_path, length); ++ goto out; ++ } ++ ++ h_inode = d_inode(h_path->dentry); ++ h_sb = h_inode->i_sb; ++ lockdep_off(); ++ sb_start_write(h_sb); ++ lockdep_on(); ++ err = locks_verify_truncate(h_inode, h_file, length); ++ if (!err) ++ err = security_path_truncate(h_path); ++ if (!err) { ++ lockdep_off(); ++ err = do_truncate(h_path->dentry, length, attr, h_file); ++ lockdep_on(); ++ } ++ lockdep_off(); ++ sb_end_write(h_sb); ++ lockdep_on(); ++ ++out: ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct au_vfsub_mkdir_args { ++ int *errp; ++ struct inode *dir; ++ struct path *path; ++ int mode; ++}; ++ ++static void au_call_vfsub_mkdir(void *args) ++{ ++ struct au_vfsub_mkdir_args *a = args; ++ *a->errp = vfsub_mkdir(a->dir, a->path, a->mode); ++} ++ ++int vfsub_sio_mkdir(struct inode *dir, struct path *path, int mode) ++{ ++ int err, do_sio, wkq_err; ++ ++ do_sio = au_test_h_perm_sio(dir, MAY_EXEC | MAY_WRITE); ++ if (!do_sio) { ++ lockdep_off(); ++ err = vfsub_mkdir(dir, path, mode); ++ lockdep_on(); ++ } else { ++ struct au_vfsub_mkdir_args args = { ++ .errp = &err, ++ .dir = dir, ++ .path = path, ++ .mode = mode ++ }; ++ wkq_err = au_wkq_wait(au_call_vfsub_mkdir, &args); ++ if (unlikely(wkq_err)) ++ err = wkq_err; ++ } ++ ++ return err; ++} ++ ++struct au_vfsub_rmdir_args { ++ int *errp; ++ struct inode *dir; ++ struct path *path; ++}; ++ ++static void au_call_vfsub_rmdir(void *args) ++{ ++ struct au_vfsub_rmdir_args *a = args; ++ *a->errp = vfsub_rmdir(a->dir, a->path); ++} ++ ++int vfsub_sio_rmdir(struct inode *dir, struct path *path) ++{ ++ int err, do_sio, wkq_err; ++ ++ do_sio = au_test_h_perm_sio(dir, MAY_EXEC | MAY_WRITE); ++ if (!do_sio) { ++ lockdep_off(); ++ err = vfsub_rmdir(dir, path); ++ lockdep_on(); ++ } else { ++ struct au_vfsub_rmdir_args args = { ++ .errp = &err, ++ .dir = dir, ++ .path = path ++ }; ++ wkq_err = au_wkq_wait(au_call_vfsub_rmdir, &args); ++ if (unlikely(wkq_err)) ++ err = wkq_err; ++ } ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct notify_change_args { ++ int *errp; ++ struct path *path; ++ struct iattr *ia; ++ struct inode **delegated_inode; ++}; ++ ++static void call_notify_change(void *args) ++{ ++ struct notify_change_args *a = args; ++ struct inode *h_inode; ++ ++ h_inode = d_inode(a->path->dentry); ++ IMustLock(h_inode); ++ ++ *a->errp = -EPERM; ++ if (!IS_IMMUTABLE(h_inode) && !IS_APPEND(h_inode)) { ++ lockdep_off(); ++ *a->errp = notify_change(a->path->dentry, a->ia, ++ a->delegated_inode); ++ lockdep_on(); ++ if (!*a->errp) ++ vfsub_update_h_iattr(a->path, /*did*/NULL); /*ignore*/ ++ } ++ AuTraceErr(*a->errp); ++} ++ ++int vfsub_notify_change(struct path *path, struct iattr *ia, ++ struct inode **delegated_inode) ++{ ++ int err; ++ struct notify_change_args args = { ++ .errp = &err, ++ .path = path, ++ .ia = ia, ++ .delegated_inode = delegated_inode ++ }; ++ ++ call_notify_change(&args); ++ ++ return err; ++} ++ ++int vfsub_sio_notify_change(struct path *path, struct iattr *ia, ++ struct inode **delegated_inode) ++{ ++ int err, wkq_err; ++ struct notify_change_args args = { ++ .errp = &err, ++ .path = path, ++ .ia = ia, ++ .delegated_inode = delegated_inode ++ }; ++ ++ wkq_err = au_wkq_wait(call_notify_change, &args); ++ if (unlikely(wkq_err)) ++ err = wkq_err; ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct unlink_args { ++ int *errp; ++ struct inode *dir; ++ struct path *path; ++ struct inode **delegated_inode; ++}; ++ ++static void call_unlink(void *args) ++{ ++ struct unlink_args *a = args; ++ struct dentry *d = a->path->dentry; ++ struct inode *h_inode; ++ const int stop_sillyrename = (au_test_nfs(d->d_sb) ++ && au_dcount(d) == 1); ++ ++ IMustLock(a->dir); ++ ++ a->path->dentry = d->d_parent; ++ *a->errp = security_path_unlink(a->path, d); ++ a->path->dentry = d; ++ if (unlikely(*a->errp)) ++ return; ++ ++ if (!stop_sillyrename) ++ dget(d); ++ h_inode = NULL; ++ if (d_is_positive(d)) { ++ h_inode = d_inode(d); ++ ihold(h_inode); ++ } ++ ++ lockdep_off(); ++ *a->errp = vfs_unlink(a->dir, d, a->delegated_inode); ++ lockdep_on(); ++ if (!*a->errp) { ++ struct path tmp = { ++ .dentry = d->d_parent, ++ .mnt = a->path->mnt ++ }; ++ vfsub_update_h_iattr(&tmp, /*did*/NULL); /*ignore*/ ++ } ++ ++ if (!stop_sillyrename) ++ dput(d); ++ if (h_inode) ++ iput(h_inode); ++ ++ AuTraceErr(*a->errp); ++} ++ ++/* ++ * @dir: must be locked. ++ * @dentry: target dentry. ++ */ ++int vfsub_unlink(struct inode *dir, struct path *path, ++ struct inode **delegated_inode, int force) ++{ ++ int err; ++ struct unlink_args args = { ++ .errp = &err, ++ .dir = dir, ++ .path = path, ++ .delegated_inode = delegated_inode ++ }; ++ ++ if (!force) ++ call_unlink(&args); ++ else { ++ int wkq_err; ++ ++ wkq_err = au_wkq_wait(call_unlink, &args); ++ if (unlikely(wkq_err)) ++ err = wkq_err; ++ } ++ ++ return err; ++} +diff --git a/fs/aufs/vfsub.h b/fs/aufs/vfsub.h +new file mode 100644 +index 0000000..b0f1c53 +--- /dev/null ++++ b/fs/aufs/vfsub.h +@@ -0,0 +1,310 @@ ++/* ++ * Copyright (C) 2005-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * sub-routines for VFS ++ */ ++ ++#ifndef __AUFS_VFSUB_H__ ++#define __AUFS_VFSUB_H__ ++ ++#ifdef __KERNEL__ ++ ++#include ++#include ++#include ++#include ++#include "debug.h" ++ ++/* copied from linux/fs/internal.h */ ++/* todo: BAD approach!! */ ++extern void __mnt_drop_write(struct vfsmount *); ++extern int open_check_o_direct(struct file *f); ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* lock subclass for lower inode */ ++/* default MAX_LOCKDEP_SUBCLASSES(8) is not enough */ ++/* reduce? gave up. */ ++enum { ++ AuLsc_I_Begin = I_MUTEX_PARENT2, /* 5 */ ++ AuLsc_I_PARENT, /* lower inode, parent first */ ++ AuLsc_I_PARENT2, /* copyup dirs */ ++ AuLsc_I_PARENT3, /* copyup wh */ ++ AuLsc_I_CHILD, ++ AuLsc_I_CHILD2, ++ AuLsc_I_End ++}; ++ ++/* to debug easier, do not make them inlined functions */ ++#define MtxMustLock(mtx) AuDebugOn(!mutex_is_locked(mtx)) ++#define IMustLock(i) AuDebugOn(!inode_is_locked(i)) ++ ++/* ---------------------------------------------------------------------- */ ++ ++static inline void vfsub_drop_nlink(struct inode *inode) ++{ ++ AuDebugOn(!inode->i_nlink); ++ drop_nlink(inode); ++} ++ ++static inline void vfsub_dead_dir(struct inode *inode) ++{ ++ AuDebugOn(!S_ISDIR(inode->i_mode)); ++ inode->i_flags |= S_DEAD; ++ clear_nlink(inode); ++} ++ ++static inline int vfsub_native_ro(struct inode *inode) ++{ ++ return (inode->i_sb->s_flags & MS_RDONLY) ++ || IS_RDONLY(inode) ++ /* || IS_APPEND(inode) */ ++ || IS_IMMUTABLE(inode); ++} ++ ++#ifdef CONFIG_AUFS_BR_FUSE ++int vfsub_test_mntns(struct vfsmount *mnt, struct super_block *h_sb); ++#else ++AuStubInt0(vfsub_test_mntns, struct vfsmount *mnt, struct super_block *h_sb); ++#endif ++ ++/* ---------------------------------------------------------------------- */ ++ ++int vfsub_update_h_iattr(struct path *h_path, int *did); ++struct file *vfsub_dentry_open(struct path *path, int flags); ++struct file *vfsub_filp_open(const char *path, int oflags, int mode); ++struct vfsub_aopen_args { ++ struct file *file; ++ unsigned int open_flag; ++ umode_t create_mode; ++ int *opened; ++}; ++struct au_branch; ++int vfsub_atomic_open(struct inode *dir, struct dentry *dentry, ++ struct vfsub_aopen_args *args, struct au_branch *br); ++int vfsub_kern_path(const char *name, unsigned int flags, struct path *path); ++ ++struct dentry *vfsub_lookup_one_len_unlocked(const char *name, ++ struct dentry *parent, int len); ++struct dentry *vfsub_lookup_one_len(const char *name, struct dentry *parent, ++ int len); ++ ++struct vfsub_lkup_one_args { ++ struct dentry **errp; ++ struct qstr *name; ++ struct dentry *parent; ++}; ++ ++static inline struct dentry *vfsub_lkup_one(struct qstr *name, ++ struct dentry *parent) ++{ ++ return vfsub_lookup_one_len(name->name, parent, name->len); ++} ++ ++void vfsub_call_lkup_one(void *args); ++ ++/* ---------------------------------------------------------------------- */ ++ ++static inline int vfsub_mnt_want_write(struct vfsmount *mnt) ++{ ++ int err; ++ ++ lockdep_off(); ++ err = mnt_want_write(mnt); ++ lockdep_on(); ++ return err; ++} ++ ++static inline void vfsub_mnt_drop_write(struct vfsmount *mnt) ++{ ++ lockdep_off(); ++ mnt_drop_write(mnt); ++ lockdep_on(); ++} ++ ++#if 0 /* reserved */ ++static inline void vfsub_mnt_drop_write_file(struct file *file) ++{ ++ lockdep_off(); ++ mnt_drop_write_file(file); ++ lockdep_on(); ++} ++#endif ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct au_hinode; ++struct dentry *vfsub_lock_rename(struct dentry *d1, struct au_hinode *hdir1, ++ struct dentry *d2, struct au_hinode *hdir2); ++void vfsub_unlock_rename(struct dentry *d1, struct au_hinode *hdir1, ++ struct dentry *d2, struct au_hinode *hdir2); ++ ++int vfsub_create(struct inode *dir, struct path *path, int mode, ++ bool want_excl); ++int vfsub_symlink(struct inode *dir, struct path *path, ++ const char *symname); ++int vfsub_mknod(struct inode *dir, struct path *path, int mode, dev_t dev); ++int vfsub_link(struct dentry *src_dentry, struct inode *dir, ++ struct path *path, struct inode **delegated_inode); ++int vfsub_rename(struct inode *src_hdir, struct dentry *src_dentry, ++ struct inode *hdir, struct path *path, ++ struct inode **delegated_inode); ++int vfsub_mkdir(struct inode *dir, struct path *path, int mode); ++int vfsub_rmdir(struct inode *dir, struct path *path); ++ ++/* ---------------------------------------------------------------------- */ ++ ++ssize_t vfsub_read_u(struct file *file, char __user *ubuf, size_t count, ++ loff_t *ppos); ++ssize_t vfsub_read_k(struct file *file, void *kbuf, size_t count, ++ loff_t *ppos); ++ssize_t vfsub_write_u(struct file *file, const char __user *ubuf, size_t count, ++ loff_t *ppos); ++ssize_t vfsub_write_k(struct file *file, void *kbuf, size_t count, ++ loff_t *ppos); ++int vfsub_flush(struct file *file, fl_owner_t id); ++int vfsub_iterate_dir(struct file *file, struct dir_context *ctx); ++ ++static inline loff_t vfsub_f_size_read(struct file *file) ++{ ++ return i_size_read(file_inode(file)); ++} ++ ++static inline unsigned int vfsub_file_flags(struct file *file) ++{ ++ unsigned int flags; ++ ++ spin_lock(&file->f_lock); ++ flags = file->f_flags; ++ spin_unlock(&file->f_lock); ++ ++ return flags; ++} ++ ++#if 0 /* reserved */ ++static inline void vfsub_file_accessed(struct file *h_file) ++{ ++ file_accessed(h_file); ++ vfsub_update_h_iattr(&h_file->f_path, /*did*/NULL); /*ignore*/ ++} ++#endif ++ ++#if 0 /* reserved */ ++static inline void vfsub_touch_atime(struct vfsmount *h_mnt, ++ struct dentry *h_dentry) ++{ ++ struct path h_path = { ++ .dentry = h_dentry, ++ .mnt = h_mnt ++ }; ++ touch_atime(&h_path); ++ vfsub_update_h_iattr(&h_path, /*did*/NULL); /*ignore*/ ++} ++#endif ++ ++static inline int vfsub_update_time(struct inode *h_inode, struct timespec *ts, ++ int flags) ++{ ++ return generic_update_time(h_inode, ts, flags); ++ /* no vfsub_update_h_iattr() since we don't have struct path */ ++} ++ ++#ifdef CONFIG_FS_POSIX_ACL ++static inline int vfsub_acl_chmod(struct inode *h_inode, umode_t h_mode) ++{ ++ int err; ++ ++ err = posix_acl_chmod(h_inode, h_mode); ++ if (err == -EOPNOTSUPP) ++ err = 0; ++ return err; ++} ++#else ++AuStubInt0(vfsub_acl_chmod, struct inode *h_inode, umode_t h_mode); ++#endif ++ ++long vfsub_splice_to(struct file *in, loff_t *ppos, ++ struct pipe_inode_info *pipe, size_t len, ++ unsigned int flags); ++long vfsub_splice_from(struct pipe_inode_info *pipe, struct file *out, ++ loff_t *ppos, size_t len, unsigned int flags); ++ ++static inline long vfsub_truncate(struct path *path, loff_t length) ++{ ++ long err; ++ ++ lockdep_off(); ++ err = vfs_truncate(path, length); ++ lockdep_on(); ++ return err; ++} ++ ++int vfsub_trunc(struct path *h_path, loff_t length, unsigned int attr, ++ struct file *h_file); ++int vfsub_fsync(struct file *file, struct path *path, int datasync); ++ ++/* ---------------------------------------------------------------------- */ ++ ++static inline loff_t vfsub_llseek(struct file *file, loff_t offset, int origin) ++{ ++ loff_t err; ++ ++ lockdep_off(); ++ err = vfs_llseek(file, offset, origin); ++ lockdep_on(); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++int vfsub_sio_mkdir(struct inode *dir, struct path *path, int mode); ++int vfsub_sio_rmdir(struct inode *dir, struct path *path); ++int vfsub_sio_notify_change(struct path *path, struct iattr *ia, ++ struct inode **delegated_inode); ++int vfsub_notify_change(struct path *path, struct iattr *ia, ++ struct inode **delegated_inode); ++int vfsub_unlink(struct inode *dir, struct path *path, ++ struct inode **delegated_inode, int force); ++ ++/* ---------------------------------------------------------------------- */ ++ ++static inline int vfsub_setxattr(struct dentry *dentry, const char *name, ++ const void *value, size_t size, int flags) ++{ ++ int err; ++ ++ lockdep_off(); ++ err = vfs_setxattr(dentry, name, value, size, flags); ++ lockdep_on(); ++ ++ return err; ++} ++ ++static inline int vfsub_removexattr(struct dentry *dentry, const char *name) ++{ ++ int err; ++ ++ lockdep_off(); ++ err = vfs_removexattr(dentry, name); ++ lockdep_on(); ++ ++ return err; ++} ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_VFSUB_H__ */ +diff --git a/fs/aufs/wbr_policy.c b/fs/aufs/wbr_policy.c +new file mode 100644 +index 0000000..22f4cef +--- /dev/null ++++ b/fs/aufs/wbr_policy.c +@@ -0,0 +1,765 @@ ++/* ++ * Copyright (C) 2005-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * policies for selecting one among multiple writable branches ++ */ ++ ++#include ++#include "aufs.h" ++ ++/* subset of cpup_attr() */ ++static noinline_for_stack ++int au_cpdown_attr(struct path *h_path, struct dentry *h_src) ++{ ++ int err, sbits; ++ struct iattr ia; ++ struct inode *h_isrc; ++ ++ h_isrc = d_inode(h_src); ++ ia.ia_valid = ATTR_FORCE | ATTR_MODE | ATTR_UID | ATTR_GID; ++ ia.ia_mode = h_isrc->i_mode; ++ ia.ia_uid = h_isrc->i_uid; ++ ia.ia_gid = h_isrc->i_gid; ++ sbits = !!(ia.ia_mode & (S_ISUID | S_ISGID)); ++ au_cpup_attr_flags(d_inode(h_path->dentry), h_isrc->i_flags); ++ /* no delegation since it is just created */ ++ err = vfsub_sio_notify_change(h_path, &ia, /*delegated*/NULL); ++ ++ /* is this nfs only? */ ++ if (!err && sbits && au_test_nfs(h_path->dentry->d_sb)) { ++ ia.ia_valid = ATTR_FORCE | ATTR_MODE; ++ ia.ia_mode = h_isrc->i_mode; ++ err = vfsub_sio_notify_change(h_path, &ia, /*delegated*/NULL); ++ } ++ ++ return err; ++} ++ ++#define AuCpdown_PARENT_OPQ 1 ++#define AuCpdown_WHED (1 << 1) ++#define AuCpdown_MADE_DIR (1 << 2) ++#define AuCpdown_DIROPQ (1 << 3) ++#define au_ftest_cpdown(flags, name) ((flags) & AuCpdown_##name) ++#define au_fset_cpdown(flags, name) \ ++ do { (flags) |= AuCpdown_##name; } while (0) ++#define au_fclr_cpdown(flags, name) \ ++ do { (flags) &= ~AuCpdown_##name; } while (0) ++ ++static int au_cpdown_dir_opq(struct dentry *dentry, aufs_bindex_t bdst, ++ unsigned int *flags) ++{ ++ int err; ++ struct dentry *opq_dentry; ++ ++ opq_dentry = au_diropq_create(dentry, bdst); ++ err = PTR_ERR(opq_dentry); ++ if (IS_ERR(opq_dentry)) ++ goto out; ++ dput(opq_dentry); ++ au_fset_cpdown(*flags, DIROPQ); ++ ++out: ++ return err; ++} ++ ++static int au_cpdown_dir_wh(struct dentry *dentry, struct dentry *h_parent, ++ struct inode *dir, aufs_bindex_t bdst) ++{ ++ int err; ++ struct path h_path; ++ struct au_branch *br; ++ ++ br = au_sbr(dentry->d_sb, bdst); ++ h_path.dentry = au_wh_lkup(h_parent, &dentry->d_name, br); ++ err = PTR_ERR(h_path.dentry); ++ if (IS_ERR(h_path.dentry)) ++ goto out; ++ ++ err = 0; ++ if (d_is_positive(h_path.dentry)) { ++ h_path.mnt = au_br_mnt(br); ++ err = au_wh_unlink_dentry(au_h_iptr(dir, bdst), &h_path, ++ dentry); ++ } ++ dput(h_path.dentry); ++ ++out: ++ return err; ++} ++ ++static int au_cpdown_dir(struct dentry *dentry, aufs_bindex_t bdst, ++ struct au_pin *pin, ++ struct dentry *h_parent, void *arg) ++{ ++ int err, rerr; ++ aufs_bindex_t bopq, btop; ++ struct path h_path; ++ struct dentry *parent; ++ struct inode *h_dir, *h_inode, *inode, *dir; ++ unsigned int *flags = arg; ++ ++ btop = au_dbtop(dentry); ++ /* dentry is di-locked */ ++ parent = dget_parent(dentry); ++ dir = d_inode(parent); ++ h_dir = d_inode(h_parent); ++ AuDebugOn(h_dir != au_h_iptr(dir, bdst)); ++ IMustLock(h_dir); ++ ++ err = au_lkup_neg(dentry, bdst, /*wh*/0); ++ if (unlikely(err < 0)) ++ goto out; ++ h_path.dentry = au_h_dptr(dentry, bdst); ++ h_path.mnt = au_sbr_mnt(dentry->d_sb, bdst); ++ err = vfsub_sio_mkdir(au_h_iptr(dir, bdst), &h_path, ++ S_IRWXU | S_IRUGO | S_IXUGO); ++ if (unlikely(err)) ++ goto out_put; ++ au_fset_cpdown(*flags, MADE_DIR); ++ ++ bopq = au_dbdiropq(dentry); ++ au_fclr_cpdown(*flags, WHED); ++ au_fclr_cpdown(*flags, DIROPQ); ++ if (au_dbwh(dentry) == bdst) ++ au_fset_cpdown(*flags, WHED); ++ if (!au_ftest_cpdown(*flags, PARENT_OPQ) && bopq <= bdst) ++ au_fset_cpdown(*flags, PARENT_OPQ); ++ h_inode = d_inode(h_path.dentry); ++ inode_lock_nested(h_inode, AuLsc_I_CHILD); ++ if (au_ftest_cpdown(*flags, WHED)) { ++ err = au_cpdown_dir_opq(dentry, bdst, flags); ++ if (unlikely(err)) { ++ inode_unlock(h_inode); ++ goto out_dir; ++ } ++ } ++ ++ err = au_cpdown_attr(&h_path, au_h_dptr(dentry, btop)); ++ inode_unlock(h_inode); ++ if (unlikely(err)) ++ goto out_opq; ++ ++ if (au_ftest_cpdown(*flags, WHED)) { ++ err = au_cpdown_dir_wh(dentry, h_parent, dir, bdst); ++ if (unlikely(err)) ++ goto out_opq; ++ } ++ ++ inode = d_inode(dentry); ++ if (au_ibbot(inode) < bdst) ++ au_set_ibbot(inode, bdst); ++ au_set_h_iptr(inode, bdst, au_igrab(h_inode), ++ au_hi_flags(inode, /*isdir*/1)); ++ au_fhsm_wrote(dentry->d_sb, bdst, /*force*/0); ++ goto out; /* success */ ++ ++ /* revert */ ++out_opq: ++ if (au_ftest_cpdown(*flags, DIROPQ)) { ++ inode_lock_nested(h_inode, AuLsc_I_CHILD); ++ rerr = au_diropq_remove(dentry, bdst); ++ inode_unlock(h_inode); ++ if (unlikely(rerr)) { ++ AuIOErr("failed removing diropq for %pd b%d (%d)\n", ++ dentry, bdst, rerr); ++ err = -EIO; ++ goto out; ++ } ++ } ++out_dir: ++ if (au_ftest_cpdown(*flags, MADE_DIR)) { ++ rerr = vfsub_sio_rmdir(au_h_iptr(dir, bdst), &h_path); ++ if (unlikely(rerr)) { ++ AuIOErr("failed removing %pd b%d (%d)\n", ++ dentry, bdst, rerr); ++ err = -EIO; ++ } ++ } ++out_put: ++ au_set_h_dptr(dentry, bdst, NULL); ++ if (au_dbbot(dentry) == bdst) ++ au_update_dbbot(dentry); ++out: ++ dput(parent); ++ return err; ++} ++ ++int au_cpdown_dirs(struct dentry *dentry, aufs_bindex_t bdst) ++{ ++ int err; ++ unsigned int flags; ++ ++ flags = 0; ++ err = au_cp_dirs(dentry, bdst, au_cpdown_dir, &flags); ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* policies for create */ ++ ++int au_wbr_nonopq(struct dentry *dentry, aufs_bindex_t bindex) ++{ ++ int err, i, j, ndentry; ++ aufs_bindex_t bopq; ++ struct au_dcsub_pages dpages; ++ struct au_dpage *dpage; ++ struct dentry **dentries, *parent, *d; ++ ++ err = au_dpages_init(&dpages, GFP_NOFS); ++ if (unlikely(err)) ++ goto out; ++ parent = dget_parent(dentry); ++ err = au_dcsub_pages_rev_aufs(&dpages, parent, /*do_include*/0); ++ if (unlikely(err)) ++ goto out_free; ++ ++ err = bindex; ++ for (i = 0; i < dpages.ndpage; i++) { ++ dpage = dpages.dpages + i; ++ dentries = dpage->dentries; ++ ndentry = dpage->ndentry; ++ for (j = 0; j < ndentry; j++) { ++ d = dentries[j]; ++ di_read_lock_parent2(d, !AuLock_IR); ++ bopq = au_dbdiropq(d); ++ di_read_unlock(d, !AuLock_IR); ++ if (bopq >= 0 && bopq < err) ++ err = bopq; ++ } ++ } ++ ++out_free: ++ dput(parent); ++ au_dpages_free(&dpages); ++out: ++ return err; ++} ++ ++static int au_wbr_bu(struct super_block *sb, aufs_bindex_t bindex) ++{ ++ for (; bindex >= 0; bindex--) ++ if (!au_br_rdonly(au_sbr(sb, bindex))) ++ return bindex; ++ return -EROFS; ++} ++ ++/* top down parent */ ++static int au_wbr_create_tdp(struct dentry *dentry, ++ unsigned int flags __maybe_unused) ++{ ++ int err; ++ aufs_bindex_t btop, bindex; ++ struct super_block *sb; ++ struct dentry *parent, *h_parent; ++ ++ sb = dentry->d_sb; ++ btop = au_dbtop(dentry); ++ err = btop; ++ if (!au_br_rdonly(au_sbr(sb, btop))) ++ goto out; ++ ++ err = -EROFS; ++ parent = dget_parent(dentry); ++ for (bindex = au_dbtop(parent); bindex < btop; bindex++) { ++ h_parent = au_h_dptr(parent, bindex); ++ if (!h_parent || d_is_negative(h_parent)) ++ continue; ++ ++ if (!au_br_rdonly(au_sbr(sb, bindex))) { ++ err = bindex; ++ break; ++ } ++ } ++ dput(parent); ++ ++ /* bottom up here */ ++ if (unlikely(err < 0)) { ++ err = au_wbr_bu(sb, btop - 1); ++ if (err >= 0) ++ err = au_wbr_nonopq(dentry, err); ++ } ++ ++out: ++ AuDbg("b%d\n", err); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* an exception for the policy other than tdp */ ++static int au_wbr_create_exp(struct dentry *dentry) ++{ ++ int err; ++ aufs_bindex_t bwh, bdiropq; ++ struct dentry *parent; ++ ++ err = -1; ++ bwh = au_dbwh(dentry); ++ parent = dget_parent(dentry); ++ bdiropq = au_dbdiropq(parent); ++ if (bwh >= 0) { ++ if (bdiropq >= 0) ++ err = min(bdiropq, bwh); ++ else ++ err = bwh; ++ AuDbg("%d\n", err); ++ } else if (bdiropq >= 0) { ++ err = bdiropq; ++ AuDbg("%d\n", err); ++ } ++ dput(parent); ++ ++ if (err >= 0) ++ err = au_wbr_nonopq(dentry, err); ++ ++ if (err >= 0 && au_br_rdonly(au_sbr(dentry->d_sb, err))) ++ err = -1; ++ ++ AuDbg("%d\n", err); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* round robin */ ++static int au_wbr_create_init_rr(struct super_block *sb) ++{ ++ int err; ++ ++ err = au_wbr_bu(sb, au_sbbot(sb)); ++ atomic_set(&au_sbi(sb)->si_wbr_rr_next, -err); /* less important */ ++ /* smp_mb(); */ ++ ++ AuDbg("b%d\n", err); ++ return err; ++} ++ ++static int au_wbr_create_rr(struct dentry *dentry, unsigned int flags) ++{ ++ int err, nbr; ++ unsigned int u; ++ aufs_bindex_t bindex, bbot; ++ struct super_block *sb; ++ atomic_t *next; ++ ++ err = au_wbr_create_exp(dentry); ++ if (err >= 0) ++ goto out; ++ ++ sb = dentry->d_sb; ++ next = &au_sbi(sb)->si_wbr_rr_next; ++ bbot = au_sbbot(sb); ++ nbr = bbot + 1; ++ for (bindex = 0; bindex <= bbot; bindex++) { ++ if (!au_ftest_wbr(flags, DIR)) { ++ err = atomic_dec_return(next) + 1; ++ /* modulo for 0 is meaningless */ ++ if (unlikely(!err)) ++ err = atomic_dec_return(next) + 1; ++ } else ++ err = atomic_read(next); ++ AuDbg("%d\n", err); ++ u = err; ++ err = u % nbr; ++ AuDbg("%d\n", err); ++ if (!au_br_rdonly(au_sbr(sb, err))) ++ break; ++ err = -EROFS; ++ } ++ ++ if (err >= 0) ++ err = au_wbr_nonopq(dentry, err); ++ ++out: ++ AuDbg("%d\n", err); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* most free space */ ++static void au_mfs(struct dentry *dentry, struct dentry *parent) ++{ ++ struct super_block *sb; ++ struct au_branch *br; ++ struct au_wbr_mfs *mfs; ++ struct dentry *h_parent; ++ aufs_bindex_t bindex, bbot; ++ int err; ++ unsigned long long b, bavail; ++ struct path h_path; ++ /* reduce the stack usage */ ++ struct kstatfs *st; ++ ++ st = kmalloc(sizeof(*st), GFP_NOFS); ++ if (unlikely(!st)) { ++ AuWarn1("failed updating mfs(%d), ignored\n", -ENOMEM); ++ return; ++ } ++ ++ bavail = 0; ++ sb = dentry->d_sb; ++ mfs = &au_sbi(sb)->si_wbr_mfs; ++ MtxMustLock(&mfs->mfs_lock); ++ mfs->mfs_bindex = -EROFS; ++ mfs->mfsrr_bytes = 0; ++ if (!parent) { ++ bindex = 0; ++ bbot = au_sbbot(sb); ++ } else { ++ bindex = au_dbtop(parent); ++ bbot = au_dbtaildir(parent); ++ } ++ ++ for (; bindex <= bbot; bindex++) { ++ if (parent) { ++ h_parent = au_h_dptr(parent, bindex); ++ if (!h_parent || d_is_negative(h_parent)) ++ continue; ++ } ++ br = au_sbr(sb, bindex); ++ if (au_br_rdonly(br)) ++ continue; ++ ++ /* sb->s_root for NFS is unreliable */ ++ h_path.mnt = au_br_mnt(br); ++ h_path.dentry = h_path.mnt->mnt_root; ++ err = vfs_statfs(&h_path, st); ++ if (unlikely(err)) { ++ AuWarn1("failed statfs, b%d, %d\n", bindex, err); ++ continue; ++ } ++ ++ /* when the available size is equal, select the lower one */ ++ BUILD_BUG_ON(sizeof(b) < sizeof(st->f_bavail) ++ || sizeof(b) < sizeof(st->f_bsize)); ++ b = st->f_bavail * st->f_bsize; ++ br->br_wbr->wbr_bytes = b; ++ if (b >= bavail) { ++ bavail = b; ++ mfs->mfs_bindex = bindex; ++ mfs->mfs_jiffy = jiffies; ++ } ++ } ++ ++ mfs->mfsrr_bytes = bavail; ++ AuDbg("b%d\n", mfs->mfs_bindex); ++ kfree(st); ++} ++ ++static int au_wbr_create_mfs(struct dentry *dentry, unsigned int flags) ++{ ++ int err; ++ struct dentry *parent; ++ struct super_block *sb; ++ struct au_wbr_mfs *mfs; ++ ++ err = au_wbr_create_exp(dentry); ++ if (err >= 0) ++ goto out; ++ ++ sb = dentry->d_sb; ++ parent = NULL; ++ if (au_ftest_wbr(flags, PARENT)) ++ parent = dget_parent(dentry); ++ mfs = &au_sbi(sb)->si_wbr_mfs; ++ mutex_lock(&mfs->mfs_lock); ++ if (time_after(jiffies, mfs->mfs_jiffy + mfs->mfs_expire) ++ || mfs->mfs_bindex < 0 ++ || au_br_rdonly(au_sbr(sb, mfs->mfs_bindex))) ++ au_mfs(dentry, parent); ++ mutex_unlock(&mfs->mfs_lock); ++ err = mfs->mfs_bindex; ++ dput(parent); ++ ++ if (err >= 0) ++ err = au_wbr_nonopq(dentry, err); ++ ++out: ++ AuDbg("b%d\n", err); ++ return err; ++} ++ ++static int au_wbr_create_init_mfs(struct super_block *sb) ++{ ++ struct au_wbr_mfs *mfs; ++ ++ mfs = &au_sbi(sb)->si_wbr_mfs; ++ mutex_init(&mfs->mfs_lock); ++ mfs->mfs_jiffy = 0; ++ mfs->mfs_bindex = -EROFS; ++ ++ return 0; ++} ++ ++static int au_wbr_create_fin_mfs(struct super_block *sb __maybe_unused) ++{ ++ mutex_destroy(&au_sbi(sb)->si_wbr_mfs.mfs_lock); ++ return 0; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* most free space and then round robin */ ++static int au_wbr_create_mfsrr(struct dentry *dentry, unsigned int flags) ++{ ++ int err; ++ struct au_wbr_mfs *mfs; ++ ++ err = au_wbr_create_mfs(dentry, flags); ++ if (err >= 0) { ++ mfs = &au_sbi(dentry->d_sb)->si_wbr_mfs; ++ mutex_lock(&mfs->mfs_lock); ++ if (mfs->mfsrr_bytes < mfs->mfsrr_watermark) ++ err = au_wbr_create_rr(dentry, flags); ++ mutex_unlock(&mfs->mfs_lock); ++ } ++ ++ AuDbg("b%d\n", err); ++ return err; ++} ++ ++static int au_wbr_create_init_mfsrr(struct super_block *sb) ++{ ++ int err; ++ ++ au_wbr_create_init_mfs(sb); /* ignore */ ++ err = au_wbr_create_init_rr(sb); ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* top down parent and most free space */ ++static int au_wbr_create_pmfs(struct dentry *dentry, unsigned int flags) ++{ ++ int err, e2; ++ unsigned long long b; ++ aufs_bindex_t bindex, btop, bbot; ++ struct super_block *sb; ++ struct dentry *parent, *h_parent; ++ struct au_branch *br; ++ ++ err = au_wbr_create_tdp(dentry, flags); ++ if (unlikely(err < 0)) ++ goto out; ++ parent = dget_parent(dentry); ++ btop = au_dbtop(parent); ++ bbot = au_dbtaildir(parent); ++ if (btop == bbot) ++ goto out_parent; /* success */ ++ ++ e2 = au_wbr_create_mfs(dentry, flags); ++ if (e2 < 0) ++ goto out_parent; /* success */ ++ ++ /* when the available size is equal, select upper one */ ++ sb = dentry->d_sb; ++ br = au_sbr(sb, err); ++ b = br->br_wbr->wbr_bytes; ++ AuDbg("b%d, %llu\n", err, b); ++ ++ for (bindex = btop; bindex <= bbot; bindex++) { ++ h_parent = au_h_dptr(parent, bindex); ++ if (!h_parent || d_is_negative(h_parent)) ++ continue; ++ ++ br = au_sbr(sb, bindex); ++ if (!au_br_rdonly(br) && br->br_wbr->wbr_bytes > b) { ++ b = br->br_wbr->wbr_bytes; ++ err = bindex; ++ AuDbg("b%d, %llu\n", err, b); ++ } ++ } ++ ++ if (err >= 0) ++ err = au_wbr_nonopq(dentry, err); ++ ++out_parent: ++ dput(parent); ++out: ++ AuDbg("b%d\n", err); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * - top down parent ++ * - most free space with parent ++ * - most free space round-robin regardless parent ++ */ ++static int au_wbr_create_pmfsrr(struct dentry *dentry, unsigned int flags) ++{ ++ int err; ++ unsigned long long watermark; ++ struct super_block *sb; ++ struct au_branch *br; ++ struct au_wbr_mfs *mfs; ++ ++ err = au_wbr_create_pmfs(dentry, flags | AuWbr_PARENT); ++ if (unlikely(err < 0)) ++ goto out; ++ ++ sb = dentry->d_sb; ++ br = au_sbr(sb, err); ++ mfs = &au_sbi(sb)->si_wbr_mfs; ++ mutex_lock(&mfs->mfs_lock); ++ watermark = mfs->mfsrr_watermark; ++ mutex_unlock(&mfs->mfs_lock); ++ if (br->br_wbr->wbr_bytes < watermark) ++ /* regardless the parent dir */ ++ err = au_wbr_create_mfsrr(dentry, flags); ++ ++out: ++ AuDbg("b%d\n", err); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* policies for copyup */ ++ ++/* top down parent */ ++static int au_wbr_copyup_tdp(struct dentry *dentry) ++{ ++ return au_wbr_create_tdp(dentry, /*flags, anything is ok*/0); ++} ++ ++/* bottom up parent */ ++static int au_wbr_copyup_bup(struct dentry *dentry) ++{ ++ int err; ++ aufs_bindex_t bindex, btop; ++ struct dentry *parent, *h_parent; ++ struct super_block *sb; ++ ++ err = -EROFS; ++ sb = dentry->d_sb; ++ parent = dget_parent(dentry); ++ btop = au_dbtop(parent); ++ for (bindex = au_dbtop(dentry); bindex >= btop; bindex--) { ++ h_parent = au_h_dptr(parent, bindex); ++ if (!h_parent || d_is_negative(h_parent)) ++ continue; ++ ++ if (!au_br_rdonly(au_sbr(sb, bindex))) { ++ err = bindex; ++ break; ++ } ++ } ++ dput(parent); ++ ++ /* bottom up here */ ++ if (unlikely(err < 0)) ++ err = au_wbr_bu(sb, btop - 1); ++ ++ AuDbg("b%d\n", err); ++ return err; ++} ++ ++/* bottom up */ ++int au_wbr_do_copyup_bu(struct dentry *dentry, aufs_bindex_t btop) ++{ ++ int err; ++ ++ err = au_wbr_bu(dentry->d_sb, btop); ++ AuDbg("b%d\n", err); ++ if (err > btop) ++ err = au_wbr_nonopq(dentry, err); ++ ++ AuDbg("b%d\n", err); ++ return err; ++} ++ ++static int au_wbr_copyup_bu(struct dentry *dentry) ++{ ++ int err; ++ aufs_bindex_t btop; ++ ++ btop = au_dbtop(dentry); ++ err = au_wbr_do_copyup_bu(dentry, btop); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct au_wbr_copyup_operations au_wbr_copyup_ops[] = { ++ [AuWbrCopyup_TDP] = { ++ .copyup = au_wbr_copyup_tdp ++ }, ++ [AuWbrCopyup_BUP] = { ++ .copyup = au_wbr_copyup_bup ++ }, ++ [AuWbrCopyup_BU] = { ++ .copyup = au_wbr_copyup_bu ++ } ++}; ++ ++struct au_wbr_create_operations au_wbr_create_ops[] = { ++ [AuWbrCreate_TDP] = { ++ .create = au_wbr_create_tdp ++ }, ++ [AuWbrCreate_RR] = { ++ .create = au_wbr_create_rr, ++ .init = au_wbr_create_init_rr ++ }, ++ [AuWbrCreate_MFS] = { ++ .create = au_wbr_create_mfs, ++ .init = au_wbr_create_init_mfs, ++ .fin = au_wbr_create_fin_mfs ++ }, ++ [AuWbrCreate_MFSV] = { ++ .create = au_wbr_create_mfs, ++ .init = au_wbr_create_init_mfs, ++ .fin = au_wbr_create_fin_mfs ++ }, ++ [AuWbrCreate_MFSRR] = { ++ .create = au_wbr_create_mfsrr, ++ .init = au_wbr_create_init_mfsrr, ++ .fin = au_wbr_create_fin_mfs ++ }, ++ [AuWbrCreate_MFSRRV] = { ++ .create = au_wbr_create_mfsrr, ++ .init = au_wbr_create_init_mfsrr, ++ .fin = au_wbr_create_fin_mfs ++ }, ++ [AuWbrCreate_PMFS] = { ++ .create = au_wbr_create_pmfs, ++ .init = au_wbr_create_init_mfs, ++ .fin = au_wbr_create_fin_mfs ++ }, ++ [AuWbrCreate_PMFSV] = { ++ .create = au_wbr_create_pmfs, ++ .init = au_wbr_create_init_mfs, ++ .fin = au_wbr_create_fin_mfs ++ }, ++ [AuWbrCreate_PMFSRR] = { ++ .create = au_wbr_create_pmfsrr, ++ .init = au_wbr_create_init_mfsrr, ++ .fin = au_wbr_create_fin_mfs ++ }, ++ [AuWbrCreate_PMFSRRV] = { ++ .create = au_wbr_create_pmfsrr, ++ .init = au_wbr_create_init_mfsrr, ++ .fin = au_wbr_create_fin_mfs ++ } ++}; +diff --git a/fs/aufs/whout.c b/fs/aufs/whout.c +new file mode 100644 +index 0000000..4a781db +--- /dev/null ++++ b/fs/aufs/whout.c +@@ -0,0 +1,1060 @@ ++/* ++ * Copyright (C) 2005-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * whiteout for logical deletion and opaque directory ++ */ ++ ++#include "aufs.h" ++ ++#define WH_MASK S_IRUGO ++ ++/* ++ * If a directory contains this file, then it is opaque. We start with the ++ * .wh. flag so that it is blocked by lookup. ++ */ ++static struct qstr diropq_name = QSTR_INIT(AUFS_WH_DIROPQ, ++ sizeof(AUFS_WH_DIROPQ) - 1); ++ ++/* ++ * generate whiteout name, which is NOT terminated by NULL. ++ * @name: original d_name.name ++ * @len: original d_name.len ++ * @wh: whiteout qstr ++ * returns zero when succeeds, otherwise error. ++ * succeeded value as wh->name should be freed by kfree(). ++ */ ++int au_wh_name_alloc(struct qstr *wh, const struct qstr *name) ++{ ++ char *p; ++ ++ if (unlikely(name->len > PATH_MAX - AUFS_WH_PFX_LEN)) ++ return -ENAMETOOLONG; ++ ++ wh->len = name->len + AUFS_WH_PFX_LEN; ++ p = kmalloc(wh->len, GFP_NOFS); ++ wh->name = p; ++ if (p) { ++ memcpy(p, AUFS_WH_PFX, AUFS_WH_PFX_LEN); ++ memcpy(p + AUFS_WH_PFX_LEN, name->name, name->len); ++ /* smp_mb(); */ ++ return 0; ++ } ++ return -ENOMEM; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * test if the @wh_name exists under @h_parent. ++ * @try_sio specifies the necessary of super-io. ++ */ ++int au_wh_test(struct dentry *h_parent, struct qstr *wh_name, int try_sio) ++{ ++ int err; ++ struct dentry *wh_dentry; ++ ++ if (!try_sio) ++ wh_dentry = vfsub_lkup_one(wh_name, h_parent); ++ else ++ wh_dentry = au_sio_lkup_one(wh_name, h_parent); ++ err = PTR_ERR(wh_dentry); ++ if (IS_ERR(wh_dentry)) { ++ if (err == -ENAMETOOLONG) ++ err = 0; ++ goto out; ++ } ++ ++ err = 0; ++ if (d_is_negative(wh_dentry)) ++ goto out_wh; /* success */ ++ ++ err = 1; ++ if (d_is_reg(wh_dentry)) ++ goto out_wh; /* success */ ++ ++ err = -EIO; ++ AuIOErr("%pd Invalid whiteout entry type 0%o.\n", ++ wh_dentry, d_inode(wh_dentry)->i_mode); ++ ++out_wh: ++ dput(wh_dentry); ++out: ++ return err; ++} ++ ++/* ++ * test if the @h_dentry sets opaque or not. ++ */ ++int au_diropq_test(struct dentry *h_dentry) ++{ ++ int err; ++ struct inode *h_dir; ++ ++ h_dir = d_inode(h_dentry); ++ err = au_wh_test(h_dentry, &diropq_name, ++ au_test_h_perm_sio(h_dir, MAY_EXEC)); ++ return err; ++} ++ ++/* ++ * returns a negative dentry whose name is unique and temporary. ++ */ ++struct dentry *au_whtmp_lkup(struct dentry *h_parent, struct au_branch *br, ++ struct qstr *prefix) ++{ ++ struct dentry *dentry; ++ int i; ++ char defname[NAME_MAX - AUFS_MAX_NAMELEN + DNAME_INLINE_LEN + 1], ++ *name, *p; ++ /* strict atomic_t is unnecessary here */ ++ static unsigned short cnt; ++ struct qstr qs; ++ ++ BUILD_BUG_ON(sizeof(cnt) * 2 > AUFS_WH_TMP_LEN); ++ ++ name = defname; ++ qs.len = sizeof(defname) - DNAME_INLINE_LEN + prefix->len - 1; ++ if (unlikely(prefix->len > DNAME_INLINE_LEN)) { ++ dentry = ERR_PTR(-ENAMETOOLONG); ++ if (unlikely(qs.len > NAME_MAX)) ++ goto out; ++ dentry = ERR_PTR(-ENOMEM); ++ name = kmalloc(qs.len + 1, GFP_NOFS); ++ if (unlikely(!name)) ++ goto out; ++ } ++ ++ /* doubly whiteout-ed */ ++ memcpy(name, AUFS_WH_PFX AUFS_WH_PFX, AUFS_WH_PFX_LEN * 2); ++ p = name + AUFS_WH_PFX_LEN * 2; ++ memcpy(p, prefix->name, prefix->len); ++ p += prefix->len; ++ *p++ = '.'; ++ AuDebugOn(name + qs.len + 1 - p <= AUFS_WH_TMP_LEN); ++ ++ qs.name = name; ++ for (i = 0; i < 3; i++) { ++ sprintf(p, "%.*x", AUFS_WH_TMP_LEN, cnt++); ++ dentry = au_sio_lkup_one(&qs, h_parent); ++ if (IS_ERR(dentry) || d_is_negative(dentry)) ++ goto out_name; ++ dput(dentry); ++ } ++ /* pr_warn("could not get random name\n"); */ ++ dentry = ERR_PTR(-EEXIST); ++ AuDbg("%.*s\n", AuLNPair(&qs)); ++ BUG(); ++ ++out_name: ++ if (name != defname) ++ kfree(name); ++out: ++ AuTraceErrPtr(dentry); ++ return dentry; ++} ++ ++/* ++ * rename the @h_dentry on @br to the whiteouted temporary name. ++ */ ++int au_whtmp_ren(struct dentry *h_dentry, struct au_branch *br) ++{ ++ int err; ++ struct path h_path = { ++ .mnt = au_br_mnt(br) ++ }; ++ struct inode *h_dir, *delegated; ++ struct dentry *h_parent; ++ ++ h_parent = h_dentry->d_parent; /* dir inode is locked */ ++ h_dir = d_inode(h_parent); ++ IMustLock(h_dir); ++ ++ h_path.dentry = au_whtmp_lkup(h_parent, br, &h_dentry->d_name); ++ err = PTR_ERR(h_path.dentry); ++ if (IS_ERR(h_path.dentry)) ++ goto out; ++ ++ /* under the same dir, no need to lock_rename() */ ++ delegated = NULL; ++ err = vfsub_rename(h_dir, h_dentry, h_dir, &h_path, &delegated); ++ AuTraceErr(err); ++ if (unlikely(err == -EWOULDBLOCK)) { ++ pr_warn("cannot retry for NFSv4 delegation" ++ " for an internal rename\n"); ++ iput(delegated); ++ } ++ dput(h_path.dentry); ++ ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++/* ++ * functions for removing a whiteout ++ */ ++ ++static int do_unlink_wh(struct inode *h_dir, struct path *h_path) ++{ ++ int err, force; ++ struct inode *delegated; ++ ++ /* ++ * forces superio when the dir has a sticky bit. ++ * this may be a violation of unix fs semantics. ++ */ ++ force = (h_dir->i_mode & S_ISVTX) ++ && !uid_eq(current_fsuid(), d_inode(h_path->dentry)->i_uid); ++ delegated = NULL; ++ err = vfsub_unlink(h_dir, h_path, &delegated, force); ++ if (unlikely(err == -EWOULDBLOCK)) { ++ pr_warn("cannot retry for NFSv4 delegation" ++ " for an internal unlink\n"); ++ iput(delegated); ++ } ++ return err; ++} ++ ++int au_wh_unlink_dentry(struct inode *h_dir, struct path *h_path, ++ struct dentry *dentry) ++{ ++ int err; ++ ++ err = do_unlink_wh(h_dir, h_path); ++ if (!err && dentry) ++ au_set_dbwh(dentry, -1); ++ ++ return err; ++} ++ ++static int unlink_wh_name(struct dentry *h_parent, struct qstr *wh, ++ struct au_branch *br) ++{ ++ int err; ++ struct path h_path = { ++ .mnt = au_br_mnt(br) ++ }; ++ ++ err = 0; ++ h_path.dentry = vfsub_lkup_one(wh, h_parent); ++ if (IS_ERR(h_path.dentry)) ++ err = PTR_ERR(h_path.dentry); ++ else { ++ if (d_is_reg(h_path.dentry)) ++ err = do_unlink_wh(d_inode(h_parent), &h_path); ++ dput(h_path.dentry); ++ } ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++/* ++ * initialize/clean whiteout for a branch ++ */ ++ ++static void au_wh_clean(struct inode *h_dir, struct path *whpath, ++ const int isdir) ++{ ++ int err; ++ struct inode *delegated; ++ ++ if (d_is_negative(whpath->dentry)) ++ return; ++ ++ if (isdir) ++ err = vfsub_rmdir(h_dir, whpath); ++ else { ++ delegated = NULL; ++ err = vfsub_unlink(h_dir, whpath, &delegated, /*force*/0); ++ if (unlikely(err == -EWOULDBLOCK)) { ++ pr_warn("cannot retry for NFSv4 delegation" ++ " for an internal unlink\n"); ++ iput(delegated); ++ } ++ } ++ if (unlikely(err)) ++ pr_warn("failed removing %pd (%d), ignored.\n", ++ whpath->dentry, err); ++} ++ ++static int test_linkable(struct dentry *h_root) ++{ ++ struct inode *h_dir = d_inode(h_root); ++ ++ if (h_dir->i_op->link) ++ return 0; ++ ++ pr_err("%pd (%s) doesn't support link(2), use noplink and rw+nolwh\n", ++ h_root, au_sbtype(h_root->d_sb)); ++ return -ENOSYS; ++} ++ ++/* todo: should this mkdir be done in /sbin/mount.aufs helper? */ ++static int au_whdir(struct inode *h_dir, struct path *path) ++{ ++ int err; ++ ++ err = -EEXIST; ++ if (d_is_negative(path->dentry)) { ++ int mode = S_IRWXU; ++ ++ if (au_test_nfs(path->dentry->d_sb)) ++ mode |= S_IXUGO; ++ err = vfsub_mkdir(h_dir, path, mode); ++ } else if (d_is_dir(path->dentry)) ++ err = 0; ++ else ++ pr_err("unknown %pd exists\n", path->dentry); ++ ++ return err; ++} ++ ++struct au_wh_base { ++ const struct qstr *name; ++ struct dentry *dentry; ++}; ++ ++static void au_wh_init_ro(struct inode *h_dir, struct au_wh_base base[], ++ struct path *h_path) ++{ ++ h_path->dentry = base[AuBrWh_BASE].dentry; ++ au_wh_clean(h_dir, h_path, /*isdir*/0); ++ h_path->dentry = base[AuBrWh_PLINK].dentry; ++ au_wh_clean(h_dir, h_path, /*isdir*/1); ++ h_path->dentry = base[AuBrWh_ORPH].dentry; ++ au_wh_clean(h_dir, h_path, /*isdir*/1); ++} ++ ++/* ++ * returns tri-state, ++ * minus: error, caller should print the message ++ * zero: succuess ++ * plus: error, caller should NOT print the message ++ */ ++static int au_wh_init_rw_nolink(struct dentry *h_root, struct au_wbr *wbr, ++ int do_plink, struct au_wh_base base[], ++ struct path *h_path) ++{ ++ int err; ++ struct inode *h_dir; ++ ++ h_dir = d_inode(h_root); ++ h_path->dentry = base[AuBrWh_BASE].dentry; ++ au_wh_clean(h_dir, h_path, /*isdir*/0); ++ h_path->dentry = base[AuBrWh_PLINK].dentry; ++ if (do_plink) { ++ err = test_linkable(h_root); ++ if (unlikely(err)) { ++ err = 1; ++ goto out; ++ } ++ ++ err = au_whdir(h_dir, h_path); ++ if (unlikely(err)) ++ goto out; ++ wbr->wbr_plink = dget(base[AuBrWh_PLINK].dentry); ++ } else ++ au_wh_clean(h_dir, h_path, /*isdir*/1); ++ h_path->dentry = base[AuBrWh_ORPH].dentry; ++ err = au_whdir(h_dir, h_path); ++ if (unlikely(err)) ++ goto out; ++ wbr->wbr_orph = dget(base[AuBrWh_ORPH].dentry); ++ ++out: ++ return err; ++} ++ ++/* ++ * for the moment, aufs supports the branch filesystem which does not support ++ * link(2). testing on FAT which does not support i_op->setattr() fully either, ++ * copyup failed. finally, such filesystem will not be used as the writable ++ * branch. ++ * ++ * returns tri-state, see above. ++ */ ++static int au_wh_init_rw(struct dentry *h_root, struct au_wbr *wbr, ++ int do_plink, struct au_wh_base base[], ++ struct path *h_path) ++{ ++ int err; ++ struct inode *h_dir; ++ ++ WbrWhMustWriteLock(wbr); ++ ++ err = test_linkable(h_root); ++ if (unlikely(err)) { ++ err = 1; ++ goto out; ++ } ++ ++ /* ++ * todo: should this create be done in /sbin/mount.aufs helper? ++ */ ++ err = -EEXIST; ++ h_dir = d_inode(h_root); ++ if (d_is_negative(base[AuBrWh_BASE].dentry)) { ++ h_path->dentry = base[AuBrWh_BASE].dentry; ++ err = vfsub_create(h_dir, h_path, WH_MASK, /*want_excl*/true); ++ } else if (d_is_reg(base[AuBrWh_BASE].dentry)) ++ err = 0; ++ else ++ pr_err("unknown %pd2 exists\n", base[AuBrWh_BASE].dentry); ++ if (unlikely(err)) ++ goto out; ++ ++ h_path->dentry = base[AuBrWh_PLINK].dentry; ++ if (do_plink) { ++ err = au_whdir(h_dir, h_path); ++ if (unlikely(err)) ++ goto out; ++ wbr->wbr_plink = dget(base[AuBrWh_PLINK].dentry); ++ } else ++ au_wh_clean(h_dir, h_path, /*isdir*/1); ++ wbr->wbr_whbase = dget(base[AuBrWh_BASE].dentry); ++ ++ h_path->dentry = base[AuBrWh_ORPH].dentry; ++ err = au_whdir(h_dir, h_path); ++ if (unlikely(err)) ++ goto out; ++ wbr->wbr_orph = dget(base[AuBrWh_ORPH].dentry); ++ ++out: ++ return err; ++} ++ ++/* ++ * initialize the whiteout base file/dir for @br. ++ */ ++int au_wh_init(struct au_branch *br, struct super_block *sb) ++{ ++ int err, i; ++ const unsigned char do_plink ++ = !!au_opt_test(au_mntflags(sb), PLINK); ++ struct inode *h_dir; ++ struct path path = br->br_path; ++ struct dentry *h_root = path.dentry; ++ struct au_wbr *wbr = br->br_wbr; ++ static const struct qstr base_name[] = { ++ [AuBrWh_BASE] = QSTR_INIT(AUFS_BASE_NAME, ++ sizeof(AUFS_BASE_NAME) - 1), ++ [AuBrWh_PLINK] = QSTR_INIT(AUFS_PLINKDIR_NAME, ++ sizeof(AUFS_PLINKDIR_NAME) - 1), ++ [AuBrWh_ORPH] = QSTR_INIT(AUFS_ORPHDIR_NAME, ++ sizeof(AUFS_ORPHDIR_NAME) - 1) ++ }; ++ struct au_wh_base base[] = { ++ [AuBrWh_BASE] = { ++ .name = base_name + AuBrWh_BASE, ++ .dentry = NULL ++ }, ++ [AuBrWh_PLINK] = { ++ .name = base_name + AuBrWh_PLINK, ++ .dentry = NULL ++ }, ++ [AuBrWh_ORPH] = { ++ .name = base_name + AuBrWh_ORPH, ++ .dentry = NULL ++ } ++ }; ++ ++ if (wbr) ++ WbrWhMustWriteLock(wbr); ++ ++ for (i = 0; i < AuBrWh_Last; i++) { ++ /* doubly whiteouted */ ++ struct dentry *d; ++ ++ d = au_wh_lkup(h_root, (void *)base[i].name, br); ++ err = PTR_ERR(d); ++ if (IS_ERR(d)) ++ goto out; ++ ++ base[i].dentry = d; ++ AuDebugOn(wbr ++ && wbr->wbr_wh[i] ++ && wbr->wbr_wh[i] != base[i].dentry); ++ } ++ ++ if (wbr) ++ for (i = 0; i < AuBrWh_Last; i++) { ++ dput(wbr->wbr_wh[i]); ++ wbr->wbr_wh[i] = NULL; ++ } ++ ++ err = 0; ++ if (!au_br_writable(br->br_perm)) { ++ h_dir = d_inode(h_root); ++ au_wh_init_ro(h_dir, base, &path); ++ } else if (!au_br_wh_linkable(br->br_perm)) { ++ err = au_wh_init_rw_nolink(h_root, wbr, do_plink, base, &path); ++ if (err > 0) ++ goto out; ++ else if (err) ++ goto out_err; ++ } else { ++ err = au_wh_init_rw(h_root, wbr, do_plink, base, &path); ++ if (err > 0) ++ goto out; ++ else if (err) ++ goto out_err; ++ } ++ goto out; /* success */ ++ ++out_err: ++ pr_err("an error(%d) on the writable branch %pd(%s)\n", ++ err, h_root, au_sbtype(h_root->d_sb)); ++out: ++ for (i = 0; i < AuBrWh_Last; i++) ++ dput(base[i].dentry); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++/* ++ * whiteouts are all hard-linked usually. ++ * when its link count reaches a ceiling, we create a new whiteout base ++ * asynchronously. ++ */ ++ ++struct reinit_br_wh { ++ struct super_block *sb; ++ struct au_branch *br; ++}; ++ ++static void reinit_br_wh(void *arg) ++{ ++ int err; ++ aufs_bindex_t bindex; ++ struct path h_path; ++ struct reinit_br_wh *a = arg; ++ struct au_wbr *wbr; ++ struct inode *dir, *delegated; ++ struct dentry *h_root; ++ struct au_hinode *hdir; ++ ++ err = 0; ++ wbr = a->br->br_wbr; ++ /* big aufs lock */ ++ si_noflush_write_lock(a->sb); ++ if (!au_br_writable(a->br->br_perm)) ++ goto out; ++ bindex = au_br_index(a->sb, a->br->br_id); ++ if (unlikely(bindex < 0)) ++ goto out; ++ ++ di_read_lock_parent(a->sb->s_root, AuLock_IR); ++ dir = d_inode(a->sb->s_root); ++ hdir = au_hi(dir, bindex); ++ h_root = au_h_dptr(a->sb->s_root, bindex); ++ AuDebugOn(h_root != au_br_dentry(a->br)); ++ ++ au_hn_imtx_lock_nested(hdir, AuLsc_I_PARENT); ++ wbr_wh_write_lock(wbr); ++ err = au_h_verify(wbr->wbr_whbase, au_opt_udba(a->sb), hdir->hi_inode, ++ h_root, a->br); ++ if (!err) { ++ h_path.dentry = wbr->wbr_whbase; ++ h_path.mnt = au_br_mnt(a->br); ++ delegated = NULL; ++ err = vfsub_unlink(hdir->hi_inode, &h_path, &delegated, ++ /*force*/0); ++ if (unlikely(err == -EWOULDBLOCK)) { ++ pr_warn("cannot retry for NFSv4 delegation" ++ " for an internal unlink\n"); ++ iput(delegated); ++ } ++ } else { ++ pr_warn("%pd is moved, ignored\n", wbr->wbr_whbase); ++ err = 0; ++ } ++ dput(wbr->wbr_whbase); ++ wbr->wbr_whbase = NULL; ++ if (!err) ++ err = au_wh_init(a->br, a->sb); ++ wbr_wh_write_unlock(wbr); ++ au_hn_imtx_unlock(hdir); ++ di_read_unlock(a->sb->s_root, AuLock_IR); ++ if (!err) ++ au_fhsm_wrote(a->sb, bindex, /*force*/0); ++ ++out: ++ if (wbr) ++ atomic_dec(&wbr->wbr_wh_running); ++ au_br_put(a->br); ++ si_write_unlock(a->sb); ++ au_nwt_done(&au_sbi(a->sb)->si_nowait); ++ kfree(arg); ++ if (unlikely(err)) ++ AuIOErr("err %d\n", err); ++} ++ ++static void kick_reinit_br_wh(struct super_block *sb, struct au_branch *br) ++{ ++ int do_dec, wkq_err; ++ struct reinit_br_wh *arg; ++ ++ do_dec = 1; ++ if (atomic_inc_return(&br->br_wbr->wbr_wh_running) != 1) ++ goto out; ++ ++ /* ignore ENOMEM */ ++ arg = kmalloc(sizeof(*arg), GFP_NOFS); ++ if (arg) { ++ /* ++ * dec(wh_running), kfree(arg) and dec(br_count) ++ * in reinit function ++ */ ++ arg->sb = sb; ++ arg->br = br; ++ au_br_get(br); ++ wkq_err = au_wkq_nowait(reinit_br_wh, arg, sb, /*flags*/0); ++ if (unlikely(wkq_err)) { ++ atomic_dec(&br->br_wbr->wbr_wh_running); ++ au_br_put(br); ++ kfree(arg); ++ } ++ do_dec = 0; ++ } ++ ++out: ++ if (do_dec) ++ atomic_dec(&br->br_wbr->wbr_wh_running); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * create the whiteout @wh. ++ */ ++static int link_or_create_wh(struct super_block *sb, aufs_bindex_t bindex, ++ struct dentry *wh) ++{ ++ int err; ++ struct path h_path = { ++ .dentry = wh ++ }; ++ struct au_branch *br; ++ struct au_wbr *wbr; ++ struct dentry *h_parent; ++ struct inode *h_dir, *delegated; ++ ++ h_parent = wh->d_parent; /* dir inode is locked */ ++ h_dir = d_inode(h_parent); ++ IMustLock(h_dir); ++ ++ br = au_sbr(sb, bindex); ++ h_path.mnt = au_br_mnt(br); ++ wbr = br->br_wbr; ++ wbr_wh_read_lock(wbr); ++ if (wbr->wbr_whbase) { ++ delegated = NULL; ++ err = vfsub_link(wbr->wbr_whbase, h_dir, &h_path, &delegated); ++ if (unlikely(err == -EWOULDBLOCK)) { ++ pr_warn("cannot retry for NFSv4 delegation" ++ " for an internal link\n"); ++ iput(delegated); ++ } ++ if (!err || err != -EMLINK) ++ goto out; ++ ++ /* link count full. re-initialize br_whbase. */ ++ kick_reinit_br_wh(sb, br); ++ } ++ ++ /* return this error in this context */ ++ err = vfsub_create(h_dir, &h_path, WH_MASK, /*want_excl*/true); ++ if (!err) ++ au_fhsm_wrote(sb, bindex, /*force*/0); ++ ++out: ++ wbr_wh_read_unlock(wbr); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * create or remove the diropq. ++ */ ++static struct dentry *do_diropq(struct dentry *dentry, aufs_bindex_t bindex, ++ unsigned int flags) ++{ ++ struct dentry *opq_dentry, *h_dentry; ++ struct super_block *sb; ++ struct au_branch *br; ++ int err; ++ ++ sb = dentry->d_sb; ++ br = au_sbr(sb, bindex); ++ h_dentry = au_h_dptr(dentry, bindex); ++ opq_dentry = vfsub_lkup_one(&diropq_name, h_dentry); ++ if (IS_ERR(opq_dentry)) ++ goto out; ++ ++ if (au_ftest_diropq(flags, CREATE)) { ++ err = link_or_create_wh(sb, bindex, opq_dentry); ++ if (!err) { ++ au_set_dbdiropq(dentry, bindex); ++ goto out; /* success */ ++ } ++ } else { ++ struct path tmp = { ++ .dentry = opq_dentry, ++ .mnt = au_br_mnt(br) ++ }; ++ err = do_unlink_wh(au_h_iptr(d_inode(dentry), bindex), &tmp); ++ if (!err) ++ au_set_dbdiropq(dentry, -1); ++ } ++ dput(opq_dentry); ++ opq_dentry = ERR_PTR(err); ++ ++out: ++ return opq_dentry; ++} ++ ++struct do_diropq_args { ++ struct dentry **errp; ++ struct dentry *dentry; ++ aufs_bindex_t bindex; ++ unsigned int flags; ++}; ++ ++static void call_do_diropq(void *args) ++{ ++ struct do_diropq_args *a = args; ++ *a->errp = do_diropq(a->dentry, a->bindex, a->flags); ++} ++ ++struct dentry *au_diropq_sio(struct dentry *dentry, aufs_bindex_t bindex, ++ unsigned int flags) ++{ ++ struct dentry *diropq, *h_dentry; ++ ++ h_dentry = au_h_dptr(dentry, bindex); ++ if (!au_test_h_perm_sio(d_inode(h_dentry), MAY_EXEC | MAY_WRITE)) ++ diropq = do_diropq(dentry, bindex, flags); ++ else { ++ int wkq_err; ++ struct do_diropq_args args = { ++ .errp = &diropq, ++ .dentry = dentry, ++ .bindex = bindex, ++ .flags = flags ++ }; ++ ++ wkq_err = au_wkq_wait(call_do_diropq, &args); ++ if (unlikely(wkq_err)) ++ diropq = ERR_PTR(wkq_err); ++ } ++ ++ return diropq; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * lookup whiteout dentry. ++ * @h_parent: lower parent dentry which must exist and be locked ++ * @base_name: name of dentry which will be whiteouted ++ * returns dentry for whiteout. ++ */ ++struct dentry *au_wh_lkup(struct dentry *h_parent, struct qstr *base_name, ++ struct au_branch *br) ++{ ++ int err; ++ struct qstr wh_name; ++ struct dentry *wh_dentry; ++ ++ err = au_wh_name_alloc(&wh_name, base_name); ++ wh_dentry = ERR_PTR(err); ++ if (!err) { ++ wh_dentry = vfsub_lkup_one(&wh_name, h_parent); ++ kfree(wh_name.name); ++ } ++ return wh_dentry; ++} ++ ++/* ++ * link/create a whiteout for @dentry on @bindex. ++ */ ++struct dentry *au_wh_create(struct dentry *dentry, aufs_bindex_t bindex, ++ struct dentry *h_parent) ++{ ++ struct dentry *wh_dentry; ++ struct super_block *sb; ++ int err; ++ ++ sb = dentry->d_sb; ++ wh_dentry = au_wh_lkup(h_parent, &dentry->d_name, au_sbr(sb, bindex)); ++ if (!IS_ERR(wh_dentry) && d_is_negative(wh_dentry)) { ++ err = link_or_create_wh(sb, bindex, wh_dentry); ++ if (!err) { ++ au_set_dbwh(dentry, bindex); ++ au_fhsm_wrote(sb, bindex, /*force*/0); ++ } else { ++ dput(wh_dentry); ++ wh_dentry = ERR_PTR(err); ++ } ++ } ++ ++ return wh_dentry; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* Delete all whiteouts in this directory on branch bindex. */ ++static int del_wh_children(struct dentry *h_dentry, struct au_nhash *whlist, ++ aufs_bindex_t bindex, struct au_branch *br) ++{ ++ int err; ++ unsigned long ul, n; ++ struct qstr wh_name; ++ char *p; ++ struct hlist_head *head; ++ struct au_vdir_wh *pos; ++ struct au_vdir_destr *str; ++ ++ err = -ENOMEM; ++ p = (void *)__get_free_page(GFP_NOFS); ++ wh_name.name = p; ++ if (unlikely(!wh_name.name)) ++ goto out; ++ ++ err = 0; ++ memcpy(p, AUFS_WH_PFX, AUFS_WH_PFX_LEN); ++ p += AUFS_WH_PFX_LEN; ++ n = whlist->nh_num; ++ head = whlist->nh_head; ++ for (ul = 0; !err && ul < n; ul++, head++) { ++ hlist_for_each_entry(pos, head, wh_hash) { ++ if (pos->wh_bindex != bindex) ++ continue; ++ ++ str = &pos->wh_str; ++ if (str->len + AUFS_WH_PFX_LEN <= PATH_MAX) { ++ memcpy(p, str->name, str->len); ++ wh_name.len = AUFS_WH_PFX_LEN + str->len; ++ err = unlink_wh_name(h_dentry, &wh_name, br); ++ if (!err) ++ continue; ++ break; ++ } ++ AuIOErr("whiteout name too long %.*s\n", ++ str->len, str->name); ++ err = -EIO; ++ break; ++ } ++ } ++ free_page((unsigned long)wh_name.name); ++ ++out: ++ return err; ++} ++ ++struct del_wh_children_args { ++ int *errp; ++ struct dentry *h_dentry; ++ struct au_nhash *whlist; ++ aufs_bindex_t bindex; ++ struct au_branch *br; ++}; ++ ++static void call_del_wh_children(void *args) ++{ ++ struct del_wh_children_args *a = args; ++ *a->errp = del_wh_children(a->h_dentry, a->whlist, a->bindex, a->br); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct au_whtmp_rmdir *au_whtmp_rmdir_alloc(struct super_block *sb, gfp_t gfp) ++{ ++ struct au_whtmp_rmdir *whtmp; ++ int err; ++ unsigned int rdhash; ++ ++ SiMustAnyLock(sb); ++ ++ whtmp = kzalloc(sizeof(*whtmp), gfp); ++ if (unlikely(!whtmp)) { ++ whtmp = ERR_PTR(-ENOMEM); ++ goto out; ++ } ++ ++ /* no estimation for dir size */ ++ rdhash = au_sbi(sb)->si_rdhash; ++ if (!rdhash) ++ rdhash = AUFS_RDHASH_DEF; ++ err = au_nhash_alloc(&whtmp->whlist, rdhash, gfp); ++ if (unlikely(err)) { ++ kfree(whtmp); ++ whtmp = ERR_PTR(err); ++ } ++ ++out: ++ return whtmp; ++} ++ ++void au_whtmp_rmdir_free(struct au_whtmp_rmdir *whtmp) ++{ ++ if (whtmp->br) ++ au_br_put(whtmp->br); ++ dput(whtmp->wh_dentry); ++ iput(whtmp->dir); ++ au_nhash_wh_free(&whtmp->whlist); ++ kfree(whtmp); ++} ++ ++/* ++ * rmdir the whiteouted temporary named dir @h_dentry. ++ * @whlist: whiteouted children. ++ */ ++int au_whtmp_rmdir(struct inode *dir, aufs_bindex_t bindex, ++ struct dentry *wh_dentry, struct au_nhash *whlist) ++{ ++ int err; ++ unsigned int h_nlink; ++ struct path h_tmp; ++ struct inode *wh_inode, *h_dir; ++ struct au_branch *br; ++ ++ h_dir = d_inode(wh_dentry->d_parent); /* dir inode is locked */ ++ IMustLock(h_dir); ++ ++ br = au_sbr(dir->i_sb, bindex); ++ wh_inode = d_inode(wh_dentry); ++ inode_lock_nested(wh_inode, AuLsc_I_CHILD); ++ ++ /* ++ * someone else might change some whiteouts while we were sleeping. ++ * it means this whlist may have an obsoleted entry. ++ */ ++ if (!au_test_h_perm_sio(wh_inode, MAY_EXEC | MAY_WRITE)) ++ err = del_wh_children(wh_dentry, whlist, bindex, br); ++ else { ++ int wkq_err; ++ struct del_wh_children_args args = { ++ .errp = &err, ++ .h_dentry = wh_dentry, ++ .whlist = whlist, ++ .bindex = bindex, ++ .br = br ++ }; ++ ++ wkq_err = au_wkq_wait(call_del_wh_children, &args); ++ if (unlikely(wkq_err)) ++ err = wkq_err; ++ } ++ inode_unlock(wh_inode); ++ ++ if (!err) { ++ h_tmp.dentry = wh_dentry; ++ h_tmp.mnt = au_br_mnt(br); ++ h_nlink = h_dir->i_nlink; ++ err = vfsub_rmdir(h_dir, &h_tmp); ++ /* some fs doesn't change the parent nlink in some cases */ ++ h_nlink -= h_dir->i_nlink; ++ } ++ ++ if (!err) { ++ if (au_ibtop(dir) == bindex) { ++ /* todo: dir->i_mutex is necessary */ ++ au_cpup_attr_timesizes(dir); ++ if (h_nlink) ++ vfsub_drop_nlink(dir); ++ } ++ return 0; /* success */ ++ } ++ ++ pr_warn("failed removing %pd(%d), ignored\n", wh_dentry, err); ++ return err; ++} ++ ++static void call_rmdir_whtmp(void *args) ++{ ++ int err; ++ aufs_bindex_t bindex; ++ struct au_whtmp_rmdir *a = args; ++ struct super_block *sb; ++ struct dentry *h_parent; ++ struct inode *h_dir; ++ struct au_hinode *hdir; ++ ++ /* rmdir by nfsd may cause deadlock with this i_mutex */ ++ /* inode_lock(a->dir); */ ++ err = -EROFS; ++ sb = a->dir->i_sb; ++ si_read_lock(sb, !AuLock_FLUSH); ++ if (!au_br_writable(a->br->br_perm)) ++ goto out; ++ bindex = au_br_index(sb, a->br->br_id); ++ if (unlikely(bindex < 0)) ++ goto out; ++ ++ err = -EIO; ++ ii_write_lock_parent(a->dir); ++ h_parent = dget_parent(a->wh_dentry); ++ h_dir = d_inode(h_parent); ++ hdir = au_hi(a->dir, bindex); ++ err = vfsub_mnt_want_write(au_br_mnt(a->br)); ++ if (unlikely(err)) ++ goto out_mnt; ++ au_hn_imtx_lock_nested(hdir, AuLsc_I_PARENT); ++ err = au_h_verify(a->wh_dentry, au_opt_udba(sb), h_dir, h_parent, ++ a->br); ++ if (!err) ++ err = au_whtmp_rmdir(a->dir, bindex, a->wh_dentry, &a->whlist); ++ au_hn_imtx_unlock(hdir); ++ vfsub_mnt_drop_write(au_br_mnt(a->br)); ++ ++out_mnt: ++ dput(h_parent); ++ ii_write_unlock(a->dir); ++out: ++ /* inode_unlock(a->dir); */ ++ au_whtmp_rmdir_free(a); ++ si_read_unlock(sb); ++ au_nwt_done(&au_sbi(sb)->si_nowait); ++ if (unlikely(err)) ++ AuIOErr("err %d\n", err); ++} ++ ++void au_whtmp_kick_rmdir(struct inode *dir, aufs_bindex_t bindex, ++ struct dentry *wh_dentry, struct au_whtmp_rmdir *args) ++{ ++ int wkq_err; ++ struct super_block *sb; ++ ++ IMustLock(dir); ++ ++ /* all post-process will be done in do_rmdir_whtmp(). */ ++ sb = dir->i_sb; ++ args->dir = au_igrab(dir); ++ args->br = au_sbr(sb, bindex); ++ au_br_get(args->br); ++ args->wh_dentry = dget(wh_dentry); ++ wkq_err = au_wkq_nowait(call_rmdir_whtmp, args, sb, /*flags*/0); ++ if (unlikely(wkq_err)) { ++ pr_warn("rmdir error %pd (%d), ignored\n", wh_dentry, wkq_err); ++ au_whtmp_rmdir_free(args); ++ } ++} +diff --git a/fs/aufs/whout.h b/fs/aufs/whout.h +new file mode 100644 +index 0000000..5a5c378 +--- /dev/null ++++ b/fs/aufs/whout.h +@@ -0,0 +1,85 @@ ++/* ++ * Copyright (C) 2005-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * whiteout for logical deletion and opaque directory ++ */ ++ ++#ifndef __AUFS_WHOUT_H__ ++#define __AUFS_WHOUT_H__ ++ ++#ifdef __KERNEL__ ++ ++#include "dir.h" ++ ++/* whout.c */ ++int au_wh_name_alloc(struct qstr *wh, const struct qstr *name); ++int au_wh_test(struct dentry *h_parent, struct qstr *wh_name, int try_sio); ++int au_diropq_test(struct dentry *h_dentry); ++struct au_branch; ++struct dentry *au_whtmp_lkup(struct dentry *h_parent, struct au_branch *br, ++ struct qstr *prefix); ++int au_whtmp_ren(struct dentry *h_dentry, struct au_branch *br); ++int au_wh_unlink_dentry(struct inode *h_dir, struct path *h_path, ++ struct dentry *dentry); ++int au_wh_init(struct au_branch *br, struct super_block *sb); ++ ++/* diropq flags */ ++#define AuDiropq_CREATE 1 ++#define au_ftest_diropq(flags, name) ((flags) & AuDiropq_##name) ++#define au_fset_diropq(flags, name) \ ++ do { (flags) |= AuDiropq_##name; } while (0) ++#define au_fclr_diropq(flags, name) \ ++ do { (flags) &= ~AuDiropq_##name; } while (0) ++ ++struct dentry *au_diropq_sio(struct dentry *dentry, aufs_bindex_t bindex, ++ unsigned int flags); ++struct dentry *au_wh_lkup(struct dentry *h_parent, struct qstr *base_name, ++ struct au_branch *br); ++struct dentry *au_wh_create(struct dentry *dentry, aufs_bindex_t bindex, ++ struct dentry *h_parent); ++ ++/* real rmdir for the whiteout-ed dir */ ++struct au_whtmp_rmdir { ++ struct inode *dir; ++ struct au_branch *br; ++ struct dentry *wh_dentry; ++ struct au_nhash whlist; ++}; ++ ++struct au_whtmp_rmdir *au_whtmp_rmdir_alloc(struct super_block *sb, gfp_t gfp); ++void au_whtmp_rmdir_free(struct au_whtmp_rmdir *whtmp); ++int au_whtmp_rmdir(struct inode *dir, aufs_bindex_t bindex, ++ struct dentry *wh_dentry, struct au_nhash *whlist); ++void au_whtmp_kick_rmdir(struct inode *dir, aufs_bindex_t bindex, ++ struct dentry *wh_dentry, struct au_whtmp_rmdir *args); ++ ++/* ---------------------------------------------------------------------- */ ++ ++static inline struct dentry *au_diropq_create(struct dentry *dentry, ++ aufs_bindex_t bindex) ++{ ++ return au_diropq_sio(dentry, bindex, AuDiropq_CREATE); ++} ++ ++static inline int au_diropq_remove(struct dentry *dentry, aufs_bindex_t bindex) ++{ ++ return PTR_ERR(au_diropq_sio(dentry, bindex, !AuDiropq_CREATE)); ++} ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_WHOUT_H__ */ +diff --git a/fs/aufs/wkq.c b/fs/aufs/wkq.c +new file mode 100644 +index 0000000..8c17dc7 +--- /dev/null ++++ b/fs/aufs/wkq.c +@@ -0,0 +1,218 @@ ++/* ++ * Copyright (C) 2005-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * workqueue for asynchronous/super-io operations ++ * todo: try new dredential scheme ++ */ ++ ++#include ++#include "aufs.h" ++ ++/* internal workqueue named AUFS_WKQ_NAME */ ++ ++static struct workqueue_struct *au_wkq; ++ ++struct au_wkinfo { ++ struct work_struct wk; ++ struct kobject *kobj; ++ ++ unsigned int flags; /* see wkq.h */ ++ ++ au_wkq_func_t func; ++ void *args; ++ ++ struct completion *comp; ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++static void wkq_func(struct work_struct *wk) ++{ ++ struct au_wkinfo *wkinfo = container_of(wk, struct au_wkinfo, wk); ++ ++ AuDebugOn(!uid_eq(current_fsuid(), GLOBAL_ROOT_UID)); ++ AuDebugOn(rlimit(RLIMIT_FSIZE) != RLIM_INFINITY); ++ ++ wkinfo->func(wkinfo->args); ++ if (au_ftest_wkq(wkinfo->flags, WAIT)) ++ complete(wkinfo->comp); ++ else { ++ kobject_put(wkinfo->kobj); ++ module_put(THIS_MODULE); /* todo: ?? */ ++ kfree(wkinfo); ++ } ++} ++ ++/* ++ * Since struct completion is large, try allocating it dynamically. ++ */ ++#if 1 /* defined(CONFIG_4KSTACKS) || defined(AuTest4KSTACKS) */ ++#define AuWkqCompDeclare(name) struct completion *comp = NULL ++ ++static int au_wkq_comp_alloc(struct au_wkinfo *wkinfo, struct completion **comp) ++{ ++ *comp = kmalloc(sizeof(**comp), GFP_NOFS); ++ if (*comp) { ++ init_completion(*comp); ++ wkinfo->comp = *comp; ++ return 0; ++ } ++ return -ENOMEM; ++} ++ ++static void au_wkq_comp_free(struct completion *comp) ++{ ++ kfree(comp); ++} ++ ++#else ++ ++/* no braces */ ++#define AuWkqCompDeclare(name) \ ++ DECLARE_COMPLETION_ONSTACK(_ ## name); \ ++ struct completion *comp = &_ ## name ++ ++static int au_wkq_comp_alloc(struct au_wkinfo *wkinfo, struct completion **comp) ++{ ++ wkinfo->comp = *comp; ++ return 0; ++} ++ ++static void au_wkq_comp_free(struct completion *comp __maybe_unused) ++{ ++ /* empty */ ++} ++#endif /* 4KSTACKS */ ++ ++static void au_wkq_run(struct au_wkinfo *wkinfo) ++{ ++ if (au_ftest_wkq(wkinfo->flags, NEST)) { ++ if (au_wkq_test()) { ++ AuWarn1("wkq from wkq, unless silly-rename on NFS," ++ " due to a dead dir by UDBA?\n"); ++ AuDebugOn(au_ftest_wkq(wkinfo->flags, WAIT)); ++ } ++ } else ++ au_dbg_verify_kthread(); ++ ++ if (au_ftest_wkq(wkinfo->flags, WAIT)) { ++ INIT_WORK_ONSTACK(&wkinfo->wk, wkq_func); ++ queue_work(au_wkq, &wkinfo->wk); ++ } else { ++ INIT_WORK(&wkinfo->wk, wkq_func); ++ schedule_work(&wkinfo->wk); ++ } ++} ++ ++/* ++ * Be careful. It is easy to make deadlock happen. ++ * processA: lock, wkq and wait ++ * processB: wkq and wait, lock in wkq ++ * --> deadlock ++ */ ++int au_wkq_do_wait(unsigned int flags, au_wkq_func_t func, void *args) ++{ ++ int err; ++ AuWkqCompDeclare(comp); ++ struct au_wkinfo wkinfo = { ++ .flags = flags, ++ .func = func, ++ .args = args ++ }; ++ ++ err = au_wkq_comp_alloc(&wkinfo, &comp); ++ if (!err) { ++ au_wkq_run(&wkinfo); ++ /* no timeout, no interrupt */ ++ wait_for_completion(wkinfo.comp); ++ au_wkq_comp_free(comp); ++ destroy_work_on_stack(&wkinfo.wk); ++ } ++ ++ return err; ++ ++} ++ ++/* ++ * Note: dget/dput() in func for aufs dentries are not supported. It will be a ++ * problem in a concurrent umounting. ++ */ ++int au_wkq_nowait(au_wkq_func_t func, void *args, struct super_block *sb, ++ unsigned int flags) ++{ ++ int err; ++ struct au_wkinfo *wkinfo; ++ ++ percpu_counter_inc(&au_sbi(sb)->si_nowait.nw_len); ++ ++ /* ++ * wkq_func() must free this wkinfo. ++ * it highly depends upon the implementation of workqueue. ++ */ ++ err = 0; ++ wkinfo = kmalloc(sizeof(*wkinfo), GFP_NOFS); ++ if (wkinfo) { ++ wkinfo->kobj = &au_sbi(sb)->si_kobj; ++ wkinfo->flags = flags & ~AuWkq_WAIT; ++ wkinfo->func = func; ++ wkinfo->args = args; ++ wkinfo->comp = NULL; ++ kobject_get(wkinfo->kobj); ++ __module_get(THIS_MODULE); /* todo: ?? */ ++ ++ au_wkq_run(wkinfo); ++ } else { ++ err = -ENOMEM; ++ au_nwt_done(&au_sbi(sb)->si_nowait); ++ } ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++void au_nwt_init(struct au_nowait_tasks *nwt) ++{ ++ percpu_counter_init(&nwt->nw_len, 0, GFP_NOFS); ++ init_waitqueue_head(&nwt->nw_wq); ++} ++ ++void au_nwt_fin(struct au_nowait_tasks *nwt) ++{ ++ AuDebugOn(percpu_counter_sum(&nwt->nw_len)); ++ percpu_counter_destroy(&nwt->nw_len); ++} ++ ++void au_wkq_fin(void) ++{ ++ destroy_workqueue(au_wkq); ++} ++ ++int __init au_wkq_init(void) ++{ ++ int err; ++ ++ err = 0; ++ au_wkq = alloc_workqueue(AUFS_WKQ_NAME, 0, WQ_DFL_ACTIVE); ++ if (IS_ERR(au_wkq)) ++ err = PTR_ERR(au_wkq); ++ else if (!au_wkq) ++ err = -ENOMEM; ++ ++ return err; ++} +diff --git a/fs/aufs/wkq.h b/fs/aufs/wkq.h +new file mode 100644 +index 0000000..7314558 +--- /dev/null ++++ b/fs/aufs/wkq.h +@@ -0,0 +1,93 @@ ++/* ++ * Copyright (C) 2005-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * workqueue for asynchronous/super-io operations ++ * todo: try new credentials management scheme ++ */ ++ ++#ifndef __AUFS_WKQ_H__ ++#define __AUFS_WKQ_H__ ++ ++#ifdef __KERNEL__ ++ ++struct super_block; ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * in the next operation, wait for the 'nowait' tasks in system-wide workqueue ++ */ ++struct au_nowait_tasks { ++ struct percpu_counter nw_len; ++ wait_queue_head_t nw_wq; ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++typedef void (*au_wkq_func_t)(void *args); ++ ++/* wkq flags */ ++#define AuWkq_WAIT 1 ++#define AuWkq_NEST (1 << 1) ++#define au_ftest_wkq(flags, name) ((flags) & AuWkq_##name) ++#define au_fset_wkq(flags, name) \ ++ do { (flags) |= AuWkq_##name; } while (0) ++#define au_fclr_wkq(flags, name) \ ++ do { (flags) &= ~AuWkq_##name; } while (0) ++ ++#ifndef CONFIG_AUFS_HNOTIFY ++#undef AuWkq_NEST ++#define AuWkq_NEST 0 ++#endif ++ ++/* wkq.c */ ++int au_wkq_do_wait(unsigned int flags, au_wkq_func_t func, void *args); ++int au_wkq_nowait(au_wkq_func_t func, void *args, struct super_block *sb, ++ unsigned int flags); ++void au_nwt_init(struct au_nowait_tasks *nwt); ++void au_nwt_fin(struct au_nowait_tasks *nwt); ++int __init au_wkq_init(void); ++void au_wkq_fin(void); ++ ++/* ---------------------------------------------------------------------- */ ++ ++static inline int au_wkq_test(void) ++{ ++ return current->flags & PF_WQ_WORKER; ++} ++ ++static inline int au_wkq_wait(au_wkq_func_t func, void *args) ++{ ++ return au_wkq_do_wait(AuWkq_WAIT, func, args); ++} ++ ++static inline void au_nwt_done(struct au_nowait_tasks *nwt) ++{ ++ percpu_counter_dec(&nwt->nw_len); ++ if (!percpu_counter_sum(&nwt->nw_len)) ++ wake_up_all(&nwt->nw_wq); ++} ++ ++static inline int au_nwt_flush(struct au_nowait_tasks *nwt) ++{ ++ wait_event(nwt->nw_wq, !percpu_counter_sum(&nwt->nw_len)); ++ return 0; ++} ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_WKQ_H__ */ +diff --git a/fs/aufs/xattr.c b/fs/aufs/xattr.c +new file mode 100644 +index 0000000..26223a2 +--- /dev/null ++++ b/fs/aufs/xattr.c +@@ -0,0 +1,344 @@ ++/* ++ * Copyright (C) 2014-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * handling xattr functions ++ */ ++ ++#include ++#include "aufs.h" ++ ++static int au_xattr_ignore(int err, char *name, unsigned int ignore_flags) ++{ ++ if (!ignore_flags) ++ goto out; ++ switch (err) { ++ case -ENOMEM: ++ case -EDQUOT: ++ goto out; ++ } ++ ++ if ((ignore_flags & AuBrAttr_ICEX) == AuBrAttr_ICEX) { ++ err = 0; ++ goto out; ++ } ++ ++#define cmp(brattr, prefix) do { \ ++ if (!strncmp(name, XATTR_##prefix##_PREFIX, \ ++ XATTR_##prefix##_PREFIX_LEN)) { \ ++ if (ignore_flags & AuBrAttr_ICEX_##brattr) \ ++ err = 0; \ ++ goto out; \ ++ } \ ++ } while (0) ++ ++ cmp(SEC, SECURITY); ++ cmp(SYS, SYSTEM); ++ cmp(TR, TRUSTED); ++ cmp(USR, USER); ++#undef cmp ++ ++ if (ignore_flags & AuBrAttr_ICEX_OTH) ++ err = 0; ++ ++out: ++ return err; ++} ++ ++static const int au_xattr_out_of_list = AuBrAttr_ICEX_OTH << 1; ++ ++static int au_do_cpup_xattr(struct dentry *h_dst, struct dentry *h_src, ++ char *name, char **buf, unsigned int ignore_flags, ++ unsigned int verbose) ++{ ++ int err; ++ ssize_t ssz; ++ struct inode *h_idst; ++ ++ ssz = vfs_getxattr_alloc(h_src, name, buf, 0, GFP_NOFS); ++ err = ssz; ++ if (unlikely(err <= 0)) { ++ if (err == -ENODATA ++ || (err == -EOPNOTSUPP ++ && ((ignore_flags & au_xattr_out_of_list) ++ || (au_test_nfs_noacl(d_inode(h_src)) ++ && (!strcmp(name, XATTR_NAME_POSIX_ACL_ACCESS) ++ || !strcmp(name, ++ XATTR_NAME_POSIX_ACL_DEFAULT)))) ++ )) ++ err = 0; ++ if (err && (verbose || au_debug_test())) ++ pr_err("%s, err %d\n", name, err); ++ goto out; ++ } ++ ++ /* unlock it temporary */ ++ h_idst = d_inode(h_dst); ++ inode_unlock(h_idst); ++ err = vfsub_setxattr(h_dst, name, *buf, ssz, /*flags*/0); ++ inode_lock_nested(h_idst, AuLsc_I_CHILD2); ++ if (unlikely(err)) { ++ if (verbose || au_debug_test()) ++ pr_err("%s, err %d\n", name, err); ++ err = au_xattr_ignore(err, name, ignore_flags); ++ } ++ ++out: ++ return err; ++} ++ ++int au_cpup_xattr(struct dentry *h_dst, struct dentry *h_src, int ignore_flags, ++ unsigned int verbose) ++{ ++ int err, unlocked, acl_access, acl_default; ++ ssize_t ssz; ++ struct inode *h_isrc, *h_idst; ++ char *value, *p, *o, *e; ++ ++ /* try stopping to update the source inode while we are referencing */ ++ /* there should not be the parent-child relationship between them */ ++ h_isrc = d_inode(h_src); ++ h_idst = d_inode(h_dst); ++ inode_unlock(h_idst); ++ inode_lock_nested(h_isrc, AuLsc_I_CHILD); ++ inode_lock_nested(h_idst, AuLsc_I_CHILD2); ++ unlocked = 0; ++ ++ /* some filesystems don't list POSIX ACL, for example tmpfs */ ++ ssz = vfs_listxattr(h_src, NULL, 0); ++ err = ssz; ++ if (unlikely(err < 0)) { ++ AuTraceErr(err); ++ if (err == -ENODATA ++ || err == -EOPNOTSUPP) ++ err = 0; /* ignore */ ++ goto out; ++ } ++ ++ err = 0; ++ p = NULL; ++ o = NULL; ++ if (ssz) { ++ err = -ENOMEM; ++ p = kmalloc(ssz, GFP_NOFS); ++ o = p; ++ if (unlikely(!p)) ++ goto out; ++ err = vfs_listxattr(h_src, p, ssz); ++ } ++ inode_unlock(h_isrc); ++ unlocked = 1; ++ AuDbg("err %d, ssz %zd\n", err, ssz); ++ if (unlikely(err < 0)) ++ goto out_free; ++ ++ err = 0; ++ e = p + ssz; ++ value = NULL; ++ acl_access = 0; ++ acl_default = 0; ++ while (!err && p < e) { ++ acl_access |= !strncmp(p, XATTR_NAME_POSIX_ACL_ACCESS, ++ sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1); ++ acl_default |= !strncmp(p, XATTR_NAME_POSIX_ACL_DEFAULT, ++ sizeof(XATTR_NAME_POSIX_ACL_DEFAULT) ++ - 1); ++ err = au_do_cpup_xattr(h_dst, h_src, p, &value, ignore_flags, ++ verbose); ++ p += strlen(p) + 1; ++ } ++ AuTraceErr(err); ++ ignore_flags |= au_xattr_out_of_list; ++ if (!err && !acl_access) { ++ err = au_do_cpup_xattr(h_dst, h_src, ++ XATTR_NAME_POSIX_ACL_ACCESS, &value, ++ ignore_flags, verbose); ++ AuTraceErr(err); ++ } ++ if (!err && !acl_default) { ++ err = au_do_cpup_xattr(h_dst, h_src, ++ XATTR_NAME_POSIX_ACL_DEFAULT, &value, ++ ignore_flags, verbose); ++ AuTraceErr(err); ++ } ++ ++ kfree(value); ++ ++out_free: ++ kfree(o); ++out: ++ if (!unlocked) ++ inode_unlock(h_isrc); ++ AuTraceErr(err); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++enum { ++ AU_XATTR_LIST, ++ AU_XATTR_GET ++}; ++ ++struct au_lgxattr { ++ int type; ++ union { ++ struct { ++ char *list; ++ size_t size; ++ } list; ++ struct { ++ const char *name; ++ void *value; ++ size_t size; ++ } get; ++ } u; ++}; ++ ++static ssize_t au_lgxattr(struct dentry *dentry, struct au_lgxattr *arg) ++{ ++ ssize_t err; ++ struct path h_path; ++ struct super_block *sb; ++ ++ sb = dentry->d_sb; ++ err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM); ++ if (unlikely(err)) ++ goto out; ++ err = au_h_path_getattr(dentry, /*force*/1, &h_path); ++ if (unlikely(err)) ++ goto out_si; ++ if (unlikely(!h_path.dentry)) ++ /* illegally overlapped or something */ ++ goto out_di; /* pretending success */ ++ ++ /* always topmost entry only */ ++ switch (arg->type) { ++ case AU_XATTR_LIST: ++ err = vfs_listxattr(h_path.dentry, ++ arg->u.list.list, arg->u.list.size); ++ break; ++ case AU_XATTR_GET: ++ err = vfs_getxattr(h_path.dentry, ++ arg->u.get.name, arg->u.get.value, ++ arg->u.get.size); ++ break; ++ } ++ ++out_di: ++ di_read_unlock(dentry, AuLock_IR); ++out_si: ++ si_read_unlock(sb); ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++ssize_t aufs_listxattr(struct dentry *dentry, char *list, size_t size) ++{ ++ struct au_lgxattr arg = { ++ .type = AU_XATTR_LIST, ++ .u.list = { ++ .list = list, ++ .size = size ++ }, ++ }; ++ ++ return au_lgxattr(dentry, &arg); ++} ++ ++ssize_t aufs_getxattr(struct dentry *dentry, const char *name, void *value, ++ size_t size) ++{ ++ struct au_lgxattr arg = { ++ .type = AU_XATTR_GET, ++ .u.get = { ++ .name = name, ++ .value = value, ++ .size = size ++ }, ++ }; ++ ++ return au_lgxattr(dentry, &arg); ++} ++ ++int aufs_setxattr(struct dentry *dentry, const char *name, const void *value, ++ size_t size, int flags) ++{ ++ struct au_srxattr arg = { ++ .type = AU_XATTR_SET, ++ .u.set = { ++ .name = name, ++ .value = value, ++ .size = size, ++ .flags = flags ++ }, ++ }; ++ ++ return au_srxattr(dentry, &arg); ++} ++ ++int aufs_removexattr(struct dentry *dentry, const char *name) ++{ ++ struct au_srxattr arg = { ++ .type = AU_XATTR_REMOVE, ++ .u.remove = { ++ .name = name ++ }, ++ }; ++ ++ return au_srxattr(dentry, &arg); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++#if 0 ++static size_t au_xattr_list(struct dentry *dentry, char *list, size_t list_size, ++ const char *name, size_t name_len, int type) ++{ ++ return aufs_listxattr(dentry, list, list_size); ++} ++ ++static int au_xattr_get(struct dentry *dentry, const char *name, void *buffer, ++ size_t size, int type) ++{ ++ return aufs_getxattr(dentry, name, buffer, size); ++} ++ ++static int au_xattr_set(struct dentry *dentry, const char *name, ++ const void *value, size_t size, int flags, int type) ++{ ++ return aufs_setxattr(dentry, name, value, size, flags); ++} ++ ++static const struct xattr_handler au_xattr_handler = { ++ /* no prefix, no flags */ ++ .list = au_xattr_list, ++ .get = au_xattr_get, ++ .set = au_xattr_set ++ /* why no remove? */ ++}; ++ ++static const struct xattr_handler *au_xattr_handlers[] = { ++ &au_xattr_handler ++}; ++ ++void au_xattr_init(struct super_block *sb) ++{ ++ /* sb->s_xattr = au_xattr_handlers; */ ++} ++#endif +diff --git a/fs/aufs/xino.c b/fs/aufs/xino.c +new file mode 100644 +index 0000000..ed7b243 +--- /dev/null ++++ b/fs/aufs/xino.c +@@ -0,0 +1,1317 @@ ++/* ++ * Copyright (C) 2005-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++/* ++ * external inode number translation table and bitmap ++ */ ++ ++#include ++#include ++#include "aufs.h" ++ ++/* todo: unnecessary to support mmap_sem since kernel-space? */ ++ssize_t xino_fread(vfs_readf_t func, struct file *file, void *kbuf, size_t size, ++ loff_t *pos) ++{ ++ ssize_t err; ++ mm_segment_t oldfs; ++ union { ++ void *k; ++ char __user *u; ++ } buf; ++ ++ buf.k = kbuf; ++ oldfs = get_fs(); ++ set_fs(KERNEL_DS); ++ do { ++ /* todo: signal_pending? */ ++ err = func(file, buf.u, size, pos); ++ } while (err == -EAGAIN || err == -EINTR); ++ set_fs(oldfs); ++ ++#if 0 /* reserved for future use */ ++ if (err > 0) ++ fsnotify_access(file->f_path.dentry); ++#endif ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static ssize_t xino_fwrite_wkq(vfs_writef_t func, struct file *file, void *buf, ++ size_t size, loff_t *pos); ++ ++static ssize_t do_xino_fwrite(vfs_writef_t func, struct file *file, void *kbuf, ++ size_t size, loff_t *pos) ++{ ++ ssize_t err; ++ mm_segment_t oldfs; ++ union { ++ void *k; ++ const char __user *u; ++ } buf; ++ int i; ++ const int prevent_endless = 10; ++ ++ i = 0; ++ buf.k = kbuf; ++ oldfs = get_fs(); ++ set_fs(KERNEL_DS); ++ do { ++ err = func(file, buf.u, size, pos); ++ if (err == -EINTR ++ && !au_wkq_test() ++ && fatal_signal_pending(current)) { ++ set_fs(oldfs); ++ err = xino_fwrite_wkq(func, file, kbuf, size, pos); ++ BUG_ON(err == -EINTR); ++ oldfs = get_fs(); ++ set_fs(KERNEL_DS); ++ } ++ } while (i++ < prevent_endless ++ && (err == -EAGAIN || err == -EINTR)); ++ set_fs(oldfs); ++ ++#if 0 /* reserved for future use */ ++ if (err > 0) ++ fsnotify_modify(file->f_path.dentry); ++#endif ++ ++ return err; ++} ++ ++struct do_xino_fwrite_args { ++ ssize_t *errp; ++ vfs_writef_t func; ++ struct file *file; ++ void *buf; ++ size_t size; ++ loff_t *pos; ++}; ++ ++static void call_do_xino_fwrite(void *args) ++{ ++ struct do_xino_fwrite_args *a = args; ++ *a->errp = do_xino_fwrite(a->func, a->file, a->buf, a->size, a->pos); ++} ++ ++static ssize_t xino_fwrite_wkq(vfs_writef_t func, struct file *file, void *buf, ++ size_t size, loff_t *pos) ++{ ++ ssize_t err; ++ int wkq_err; ++ struct do_xino_fwrite_args args = { ++ .errp = &err, ++ .func = func, ++ .file = file, ++ .buf = buf, ++ .size = size, ++ .pos = pos ++ }; ++ ++ /* ++ * it breaks RLIMIT_FSIZE and normal user's limit, ++ * users should care about quota and real 'filesystem full.' ++ */ ++ wkq_err = au_wkq_wait(call_do_xino_fwrite, &args); ++ if (unlikely(wkq_err)) ++ err = wkq_err; ++ ++ return err; ++} ++ ++ssize_t xino_fwrite(vfs_writef_t func, struct file *file, void *buf, ++ size_t size, loff_t *pos) ++{ ++ ssize_t err; ++ ++ if (rlimit(RLIMIT_FSIZE) == RLIM_INFINITY) { ++ lockdep_off(); ++ err = do_xino_fwrite(func, file, buf, size, pos); ++ lockdep_on(); ++ } else ++ err = xino_fwrite_wkq(func, file, buf, size, pos); ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * create a new xinofile at the same place/path as @base_file. ++ */ ++struct file *au_xino_create2(struct file *base_file, struct file *copy_src) ++{ ++ struct file *file; ++ struct dentry *base, *parent; ++ struct inode *dir, *delegated; ++ struct qstr *name; ++ struct path path; ++ int err; ++ ++ base = base_file->f_path.dentry; ++ parent = base->d_parent; /* dir inode is locked */ ++ dir = d_inode(parent); ++ IMustLock(dir); ++ ++ file = ERR_PTR(-EINVAL); ++ name = &base->d_name; ++ path.dentry = vfsub_lookup_one_len(name->name, parent, name->len); ++ if (IS_ERR(path.dentry)) { ++ file = (void *)path.dentry; ++ pr_err("%pd lookup err %ld\n", ++ base, PTR_ERR(path.dentry)); ++ goto out; ++ } ++ ++ /* no need to mnt_want_write() since we call dentry_open() later */ ++ err = vfs_create(dir, path.dentry, S_IRUGO | S_IWUGO, NULL); ++ if (unlikely(err)) { ++ file = ERR_PTR(err); ++ pr_err("%pd create err %d\n", base, err); ++ goto out_dput; ++ } ++ ++ path.mnt = base_file->f_path.mnt; ++ file = vfsub_dentry_open(&path, ++ O_RDWR | O_CREAT | O_EXCL | O_LARGEFILE ++ /* | __FMODE_NONOTIFY */); ++ if (IS_ERR(file)) { ++ pr_err("%pd open err %ld\n", base, PTR_ERR(file)); ++ goto out_dput; ++ } ++ ++ delegated = NULL; ++ err = vfsub_unlink(dir, &file->f_path, &delegated, /*force*/0); ++ if (unlikely(err == -EWOULDBLOCK)) { ++ pr_warn("cannot retry for NFSv4 delegation" ++ " for an internal unlink\n"); ++ iput(delegated); ++ } ++ if (unlikely(err)) { ++ pr_err("%pd unlink err %d\n", base, err); ++ goto out_fput; ++ } ++ ++ if (copy_src) { ++ /* no one can touch copy_src xino */ ++ err = au_copy_file(file, copy_src, vfsub_f_size_read(copy_src)); ++ if (unlikely(err)) { ++ pr_err("%pd copy err %d\n", base, err); ++ goto out_fput; ++ } ++ } ++ goto out_dput; /* success */ ++ ++out_fput: ++ fput(file); ++ file = ERR_PTR(err); ++out_dput: ++ dput(path.dentry); ++out: ++ return file; ++} ++ ++struct au_xino_lock_dir { ++ struct au_hinode *hdir; ++ struct dentry *parent; ++ struct inode *dir; ++}; ++ ++static void au_xino_lock_dir(struct super_block *sb, struct file *xino, ++ struct au_xino_lock_dir *ldir) ++{ ++ aufs_bindex_t brid, bindex; ++ ++ ldir->hdir = NULL; ++ bindex = -1; ++ brid = au_xino_brid(sb); ++ if (brid >= 0) ++ bindex = au_br_index(sb, brid); ++ if (bindex >= 0) { ++ ldir->hdir = au_hi(d_inode(sb->s_root), bindex); ++ au_hn_imtx_lock_nested(ldir->hdir, AuLsc_I_PARENT); ++ } else { ++ ldir->parent = dget_parent(xino->f_path.dentry); ++ ldir->dir = d_inode(ldir->parent); ++ inode_lock_nested(ldir->dir, AuLsc_I_PARENT); ++ } ++} ++ ++static void au_xino_unlock_dir(struct au_xino_lock_dir *ldir) ++{ ++ if (ldir->hdir) ++ au_hn_imtx_unlock(ldir->hdir); ++ else { ++ inode_unlock(ldir->dir); ++ dput(ldir->parent); ++ } ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* trucate xino files asynchronously */ ++ ++int au_xino_trunc(struct super_block *sb, aufs_bindex_t bindex) ++{ ++ int err; ++ unsigned long jiffy; ++ blkcnt_t blocks; ++ aufs_bindex_t bi, bbot; ++ struct kstatfs *st; ++ struct au_branch *br; ++ struct file *new_xino, *file; ++ struct super_block *h_sb; ++ struct au_xino_lock_dir ldir; ++ ++ err = -ENOMEM; ++ st = kmalloc(sizeof(*st), GFP_NOFS); ++ if (unlikely(!st)) ++ goto out; ++ ++ err = -EINVAL; ++ bbot = au_sbbot(sb); ++ if (unlikely(bindex < 0 || bbot < bindex)) ++ goto out_st; ++ br = au_sbr(sb, bindex); ++ file = br->br_xino.xi_file; ++ if (!file) ++ goto out_st; ++ ++ err = vfs_statfs(&file->f_path, st); ++ if (unlikely(err)) ++ AuErr1("statfs err %d, ignored\n", err); ++ jiffy = jiffies; ++ blocks = file_inode(file)->i_blocks; ++ pr_info("begin truncating xino(b%d), ib%llu, %llu/%llu free blks\n", ++ bindex, (u64)blocks, st->f_bfree, st->f_blocks); ++ ++ au_xino_lock_dir(sb, file, &ldir); ++ /* mnt_want_write() is unnecessary here */ ++ new_xino = au_xino_create2(file, file); ++ au_xino_unlock_dir(&ldir); ++ err = PTR_ERR(new_xino); ++ if (IS_ERR(new_xino)) { ++ pr_err("err %d, ignored\n", err); ++ goto out_st; ++ } ++ err = 0; ++ fput(file); ++ br->br_xino.xi_file = new_xino; ++ ++ h_sb = au_br_sb(br); ++ for (bi = 0; bi <= bbot; bi++) { ++ if (unlikely(bi == bindex)) ++ continue; ++ br = au_sbr(sb, bi); ++ if (au_br_sb(br) != h_sb) ++ continue; ++ ++ fput(br->br_xino.xi_file); ++ br->br_xino.xi_file = new_xino; ++ get_file(new_xino); ++ } ++ ++ err = vfs_statfs(&new_xino->f_path, st); ++ if (!err) { ++ pr_info("end truncating xino(b%d), ib%llu, %llu/%llu free blks\n", ++ bindex, (u64)file_inode(new_xino)->i_blocks, ++ st->f_bfree, st->f_blocks); ++ if (file_inode(new_xino)->i_blocks < blocks) ++ au_sbi(sb)->si_xino_jiffy = jiffy; ++ } else ++ AuErr1("statfs err %d, ignored\n", err); ++ ++out_st: ++ kfree(st); ++out: ++ return err; ++} ++ ++struct xino_do_trunc_args { ++ struct super_block *sb; ++ struct au_branch *br; ++}; ++ ++static void xino_do_trunc(void *_args) ++{ ++ struct xino_do_trunc_args *args = _args; ++ struct super_block *sb; ++ struct au_branch *br; ++ struct inode *dir; ++ int err; ++ aufs_bindex_t bindex; ++ ++ err = 0; ++ sb = args->sb; ++ dir = d_inode(sb->s_root); ++ br = args->br; ++ ++ si_noflush_write_lock(sb); ++ ii_read_lock_parent(dir); ++ bindex = au_br_index(sb, br->br_id); ++ err = au_xino_trunc(sb, bindex); ++ ii_read_unlock(dir); ++ if (unlikely(err)) ++ pr_warn("err b%d, (%d)\n", bindex, err); ++ atomic_dec(&br->br_xino_running); ++ au_br_put(br); ++ si_write_unlock(sb); ++ au_nwt_done(&au_sbi(sb)->si_nowait); ++ kfree(args); ++} ++ ++static int xino_trunc_test(struct super_block *sb, struct au_branch *br) ++{ ++ int err; ++ struct kstatfs st; ++ struct au_sbinfo *sbinfo; ++ ++ /* todo: si_xino_expire and the ratio should be customizable */ ++ sbinfo = au_sbi(sb); ++ if (time_before(jiffies, ++ sbinfo->si_xino_jiffy + sbinfo->si_xino_expire)) ++ return 0; ++ ++ /* truncation border */ ++ err = vfs_statfs(&br->br_xino.xi_file->f_path, &st); ++ if (unlikely(err)) { ++ AuErr1("statfs err %d, ignored\n", err); ++ return 0; ++ } ++ if (div64_u64(st.f_bfree * 100, st.f_blocks) >= AUFS_XINO_DEF_TRUNC) ++ return 0; ++ ++ return 1; ++} ++ ++static void xino_try_trunc(struct super_block *sb, struct au_branch *br) ++{ ++ struct xino_do_trunc_args *args; ++ int wkq_err; ++ ++ if (!xino_trunc_test(sb, br)) ++ return; ++ ++ if (atomic_inc_return(&br->br_xino_running) > 1) ++ goto out; ++ ++ /* lock and kfree() will be called in trunc_xino() */ ++ args = kmalloc(sizeof(*args), GFP_NOFS); ++ if (unlikely(!args)) { ++ AuErr1("no memory\n"); ++ goto out_args; ++ } ++ ++ au_br_get(br); ++ args->sb = sb; ++ args->br = br; ++ wkq_err = au_wkq_nowait(xino_do_trunc, args, sb, /*flags*/0); ++ if (!wkq_err) ++ return; /* success */ ++ ++ pr_err("wkq %d\n", wkq_err); ++ au_br_put(br); ++ ++out_args: ++ kfree(args); ++out: ++ atomic_dec(&br->br_xino_running); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int au_xino_do_write(vfs_writef_t write, struct file *file, ++ ino_t h_ino, ino_t ino) ++{ ++ loff_t pos; ++ ssize_t sz; ++ ++ pos = h_ino; ++ if (unlikely(au_loff_max / sizeof(ino) - 1 < pos)) { ++ AuIOErr1("too large hi%lu\n", (unsigned long)h_ino); ++ return -EFBIG; ++ } ++ pos *= sizeof(ino); ++ sz = xino_fwrite(write, file, &ino, sizeof(ino), &pos); ++ if (sz == sizeof(ino)) ++ return 0; /* success */ ++ ++ AuIOErr("write failed (%zd)\n", sz); ++ return -EIO; ++} ++ ++/* ++ * write @ino to the xinofile for the specified branch{@sb, @bindex} ++ * at the position of @h_ino. ++ * even if @ino is zero, it is written to the xinofile and means no entry. ++ * if the size of the xino file on a specific filesystem exceeds the watermark, ++ * try truncating it. ++ */ ++int au_xino_write(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino, ++ ino_t ino) ++{ ++ int err; ++ unsigned int mnt_flags; ++ struct au_branch *br; ++ ++ BUILD_BUG_ON(sizeof(long long) != sizeof(au_loff_max) ++ || ((loff_t)-1) > 0); ++ SiMustAnyLock(sb); ++ ++ mnt_flags = au_mntflags(sb); ++ if (!au_opt_test(mnt_flags, XINO)) ++ return 0; ++ ++ br = au_sbr(sb, bindex); ++ err = au_xino_do_write(au_sbi(sb)->si_xwrite, br->br_xino.xi_file, ++ h_ino, ino); ++ if (!err) { ++ if (au_opt_test(mnt_flags, TRUNC_XINO) ++ && au_test_fs_trunc_xino(au_br_sb(br))) ++ xino_try_trunc(sb, br); ++ return 0; /* success */ ++ } ++ ++ AuIOErr("write failed (%d)\n", err); ++ return -EIO; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* aufs inode number bitmap */ ++ ++static const int page_bits = (int)PAGE_SIZE * BITS_PER_BYTE; ++static ino_t xib_calc_ino(unsigned long pindex, int bit) ++{ ++ ino_t ino; ++ ++ AuDebugOn(bit < 0 || page_bits <= bit); ++ ino = AUFS_FIRST_INO + pindex * page_bits + bit; ++ return ino; ++} ++ ++static void xib_calc_bit(ino_t ino, unsigned long *pindex, int *bit) ++{ ++ AuDebugOn(ino < AUFS_FIRST_INO); ++ ino -= AUFS_FIRST_INO; ++ *pindex = ino / page_bits; ++ *bit = ino % page_bits; ++} ++ ++static int xib_pindex(struct super_block *sb, unsigned long pindex) ++{ ++ int err; ++ loff_t pos; ++ ssize_t sz; ++ struct au_sbinfo *sbinfo; ++ struct file *xib; ++ unsigned long *p; ++ ++ sbinfo = au_sbi(sb); ++ MtxMustLock(&sbinfo->si_xib_mtx); ++ AuDebugOn(pindex > ULONG_MAX / PAGE_SIZE ++ || !au_opt_test(sbinfo->si_mntflags, XINO)); ++ ++ if (pindex == sbinfo->si_xib_last_pindex) ++ return 0; ++ ++ xib = sbinfo->si_xib; ++ p = sbinfo->si_xib_buf; ++ pos = sbinfo->si_xib_last_pindex; ++ pos *= PAGE_SIZE; ++ sz = xino_fwrite(sbinfo->si_xwrite, xib, p, PAGE_SIZE, &pos); ++ if (unlikely(sz != PAGE_SIZE)) ++ goto out; ++ ++ pos = pindex; ++ pos *= PAGE_SIZE; ++ if (vfsub_f_size_read(xib) >= pos + PAGE_SIZE) ++ sz = xino_fread(sbinfo->si_xread, xib, p, PAGE_SIZE, &pos); ++ else { ++ memset(p, 0, PAGE_SIZE); ++ sz = xino_fwrite(sbinfo->si_xwrite, xib, p, PAGE_SIZE, &pos); ++ } ++ if (sz == PAGE_SIZE) { ++ sbinfo->si_xib_last_pindex = pindex; ++ return 0; /* success */ ++ } ++ ++out: ++ AuIOErr1("write failed (%zd)\n", sz); ++ err = sz; ++ if (sz >= 0) ++ err = -EIO; ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static void au_xib_clear_bit(struct inode *inode) ++{ ++ int err, bit; ++ unsigned long pindex; ++ struct super_block *sb; ++ struct au_sbinfo *sbinfo; ++ ++ AuDebugOn(inode->i_nlink); ++ ++ sb = inode->i_sb; ++ xib_calc_bit(inode->i_ino, &pindex, &bit); ++ AuDebugOn(page_bits <= bit); ++ sbinfo = au_sbi(sb); ++ mutex_lock(&sbinfo->si_xib_mtx); ++ err = xib_pindex(sb, pindex); ++ if (!err) { ++ clear_bit(bit, sbinfo->si_xib_buf); ++ sbinfo->si_xib_next_bit = bit; ++ } ++ mutex_unlock(&sbinfo->si_xib_mtx); ++} ++ ++/* for s_op->delete_inode() */ ++void au_xino_delete_inode(struct inode *inode, const int unlinked) ++{ ++ int err; ++ unsigned int mnt_flags; ++ aufs_bindex_t bindex, bbot, bi; ++ unsigned char try_trunc; ++ struct au_iinfo *iinfo; ++ struct super_block *sb; ++ struct au_hinode *hi; ++ struct inode *h_inode; ++ struct au_branch *br; ++ vfs_writef_t xwrite; ++ ++ AuDebugOn(is_bad_inode(inode)); ++ ++ sb = inode->i_sb; ++ mnt_flags = au_mntflags(sb); ++ if (!au_opt_test(mnt_flags, XINO) ++ || inode->i_ino == AUFS_ROOT_INO) ++ return; ++ ++ if (unlinked) { ++ au_xigen_inc(inode); ++ au_xib_clear_bit(inode); ++ } ++ ++ iinfo = au_ii(inode); ++ bindex = iinfo->ii_btop; ++ if (bindex < 0) ++ return; ++ ++ xwrite = au_sbi(sb)->si_xwrite; ++ try_trunc = !!au_opt_test(mnt_flags, TRUNC_XINO); ++ hi = au_hinode(iinfo, bindex); ++ bbot = iinfo->ii_bbot; ++ for (; bindex <= bbot; bindex++, hi++) { ++ h_inode = hi->hi_inode; ++ if (!h_inode ++ || (!unlinked && h_inode->i_nlink)) ++ continue; ++ ++ /* inode may not be revalidated */ ++ bi = au_br_index(sb, hi->hi_id); ++ if (bi < 0) ++ continue; ++ ++ br = au_sbr(sb, bi); ++ err = au_xino_do_write(xwrite, br->br_xino.xi_file, ++ h_inode->i_ino, /*ino*/0); ++ if (!err && try_trunc ++ && au_test_fs_trunc_xino(au_br_sb(br))) ++ xino_try_trunc(sb, br); ++ } ++} ++ ++/* get an unused inode number from bitmap */ ++ino_t au_xino_new_ino(struct super_block *sb) ++{ ++ ino_t ino; ++ unsigned long *p, pindex, ul, pend; ++ struct au_sbinfo *sbinfo; ++ struct file *file; ++ int free_bit, err; ++ ++ if (!au_opt_test(au_mntflags(sb), XINO)) ++ return iunique(sb, AUFS_FIRST_INO); ++ ++ sbinfo = au_sbi(sb); ++ mutex_lock(&sbinfo->si_xib_mtx); ++ p = sbinfo->si_xib_buf; ++ free_bit = sbinfo->si_xib_next_bit; ++ if (free_bit < page_bits && !test_bit(free_bit, p)) ++ goto out; /* success */ ++ free_bit = find_first_zero_bit(p, page_bits); ++ if (free_bit < page_bits) ++ goto out; /* success */ ++ ++ pindex = sbinfo->si_xib_last_pindex; ++ for (ul = pindex - 1; ul < ULONG_MAX; ul--) { ++ err = xib_pindex(sb, ul); ++ if (unlikely(err)) ++ goto out_err; ++ free_bit = find_first_zero_bit(p, page_bits); ++ if (free_bit < page_bits) ++ goto out; /* success */ ++ } ++ ++ file = sbinfo->si_xib; ++ pend = vfsub_f_size_read(file) / PAGE_SIZE; ++ for (ul = pindex + 1; ul <= pend; ul++) { ++ err = xib_pindex(sb, ul); ++ if (unlikely(err)) ++ goto out_err; ++ free_bit = find_first_zero_bit(p, page_bits); ++ if (free_bit < page_bits) ++ goto out; /* success */ ++ } ++ BUG(); ++ ++out: ++ set_bit(free_bit, p); ++ sbinfo->si_xib_next_bit = free_bit + 1; ++ pindex = sbinfo->si_xib_last_pindex; ++ mutex_unlock(&sbinfo->si_xib_mtx); ++ ino = xib_calc_ino(pindex, free_bit); ++ AuDbg("i%lu\n", (unsigned long)ino); ++ return ino; ++out_err: ++ mutex_unlock(&sbinfo->si_xib_mtx); ++ AuDbg("i0\n"); ++ return 0; ++} ++ ++/* ++ * read @ino from xinofile for the specified branch{@sb, @bindex} ++ * at the position of @h_ino. ++ * if @ino does not exist and @do_new is true, get new one. ++ */ ++int au_xino_read(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino, ++ ino_t *ino) ++{ ++ int err; ++ ssize_t sz; ++ loff_t pos; ++ struct file *file; ++ struct au_sbinfo *sbinfo; ++ ++ *ino = 0; ++ if (!au_opt_test(au_mntflags(sb), XINO)) ++ return 0; /* no xino */ ++ ++ err = 0; ++ sbinfo = au_sbi(sb); ++ pos = h_ino; ++ if (unlikely(au_loff_max / sizeof(*ino) - 1 < pos)) { ++ AuIOErr1("too large hi%lu\n", (unsigned long)h_ino); ++ return -EFBIG; ++ } ++ pos *= sizeof(*ino); ++ ++ file = au_sbr(sb, bindex)->br_xino.xi_file; ++ if (vfsub_f_size_read(file) < pos + sizeof(*ino)) ++ return 0; /* no ino */ ++ ++ sz = xino_fread(sbinfo->si_xread, file, ino, sizeof(*ino), &pos); ++ if (sz == sizeof(*ino)) ++ return 0; /* success */ ++ ++ err = sz; ++ if (unlikely(sz >= 0)) { ++ err = -EIO; ++ AuIOErr("xino read error (%zd)\n", sz); ++ } ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* create and set a new xino file */ ++ ++struct file *au_xino_create(struct super_block *sb, char *fname, int silent) ++{ ++ struct file *file; ++ struct dentry *h_parent, *d; ++ struct inode *h_dir, *inode; ++ int err; ++ ++ /* ++ * at mount-time, and the xino file is the default path, ++ * hnotify is disabled so we have no notify events to ignore. ++ * when a user specified the xino, we cannot get au_hdir to be ignored. ++ */ ++ file = vfsub_filp_open(fname, O_RDWR | O_CREAT | O_EXCL | O_LARGEFILE ++ /* | __FMODE_NONOTIFY */, ++ S_IRUGO | S_IWUGO); ++ if (IS_ERR(file)) { ++ if (!silent) ++ pr_err("open %s(%ld)\n", fname, PTR_ERR(file)); ++ return file; ++ } ++ ++ /* keep file count */ ++ err = 0; ++ inode = file_inode(file); ++ h_parent = dget_parent(file->f_path.dentry); ++ h_dir = d_inode(h_parent); ++ inode_lock_nested(h_dir, AuLsc_I_PARENT); ++ /* mnt_want_write() is unnecessary here */ ++ /* no delegation since it is just created */ ++ if (inode->i_nlink) ++ err = vfsub_unlink(h_dir, &file->f_path, /*delegated*/NULL, ++ /*force*/0); ++ inode_unlock(h_dir); ++ dput(h_parent); ++ if (unlikely(err)) { ++ if (!silent) ++ pr_err("unlink %s(%d)\n", fname, err); ++ goto out; ++ } ++ ++ err = -EINVAL; ++ d = file->f_path.dentry; ++ if (unlikely(sb == d->d_sb)) { ++ if (!silent) ++ pr_err("%s must be outside\n", fname); ++ goto out; ++ } ++ if (unlikely(au_test_fs_bad_xino(d->d_sb))) { ++ if (!silent) ++ pr_err("xino doesn't support %s(%s)\n", ++ fname, au_sbtype(d->d_sb)); ++ goto out; ++ } ++ return file; /* success */ ++ ++out: ++ fput(file); ++ file = ERR_PTR(err); ++ return file; ++} ++ ++/* ++ * find another branch who is on the same filesystem of the specified ++ * branch{@btgt}. search until @bbot. ++ */ ++static int is_sb_shared(struct super_block *sb, aufs_bindex_t btgt, ++ aufs_bindex_t bbot) ++{ ++ aufs_bindex_t bindex; ++ struct super_block *tgt_sb = au_sbr_sb(sb, btgt); ++ ++ for (bindex = 0; bindex < btgt; bindex++) ++ if (unlikely(tgt_sb == au_sbr_sb(sb, bindex))) ++ return bindex; ++ for (bindex++; bindex <= bbot; bindex++) ++ if (unlikely(tgt_sb == au_sbr_sb(sb, bindex))) ++ return bindex; ++ return -1; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * initialize the xinofile for the specified branch @br ++ * at the place/path where @base_file indicates. ++ * test whether another branch is on the same filesystem or not, ++ * if @do_test is true. ++ */ ++int au_xino_br(struct super_block *sb, struct au_branch *br, ino_t h_ino, ++ struct file *base_file, int do_test) ++{ ++ int err; ++ ino_t ino; ++ aufs_bindex_t bbot, bindex; ++ struct au_branch *shared_br, *b; ++ struct file *file; ++ struct super_block *tgt_sb; ++ ++ shared_br = NULL; ++ bbot = au_sbbot(sb); ++ if (do_test) { ++ tgt_sb = au_br_sb(br); ++ for (bindex = 0; bindex <= bbot; bindex++) { ++ b = au_sbr(sb, bindex); ++ if (tgt_sb == au_br_sb(b)) { ++ shared_br = b; ++ break; ++ } ++ } ++ } ++ ++ if (!shared_br || !shared_br->br_xino.xi_file) { ++ struct au_xino_lock_dir ldir; ++ ++ au_xino_lock_dir(sb, base_file, &ldir); ++ /* mnt_want_write() is unnecessary here */ ++ file = au_xino_create2(base_file, NULL); ++ au_xino_unlock_dir(&ldir); ++ err = PTR_ERR(file); ++ if (IS_ERR(file)) ++ goto out; ++ br->br_xino.xi_file = file; ++ } else { ++ br->br_xino.xi_file = shared_br->br_xino.xi_file; ++ get_file(br->br_xino.xi_file); ++ } ++ ++ ino = AUFS_ROOT_INO; ++ err = au_xino_do_write(au_sbi(sb)->si_xwrite, br->br_xino.xi_file, ++ h_ino, ino); ++ if (unlikely(err)) { ++ fput(br->br_xino.xi_file); ++ br->br_xino.xi_file = NULL; ++ } ++ ++out: ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* trucate a xino bitmap file */ ++ ++/* todo: slow */ ++static int do_xib_restore(struct super_block *sb, struct file *file, void *page) ++{ ++ int err, bit; ++ ssize_t sz; ++ unsigned long pindex; ++ loff_t pos, pend; ++ struct au_sbinfo *sbinfo; ++ vfs_readf_t func; ++ ino_t *ino; ++ unsigned long *p; ++ ++ err = 0; ++ sbinfo = au_sbi(sb); ++ MtxMustLock(&sbinfo->si_xib_mtx); ++ p = sbinfo->si_xib_buf; ++ func = sbinfo->si_xread; ++ pend = vfsub_f_size_read(file); ++ pos = 0; ++ while (pos < pend) { ++ sz = xino_fread(func, file, page, PAGE_SIZE, &pos); ++ err = sz; ++ if (unlikely(sz <= 0)) ++ goto out; ++ ++ err = 0; ++ for (ino = page; sz > 0; ino++, sz -= sizeof(ino)) { ++ if (unlikely(*ino < AUFS_FIRST_INO)) ++ continue; ++ ++ xib_calc_bit(*ino, &pindex, &bit); ++ AuDebugOn(page_bits <= bit); ++ err = xib_pindex(sb, pindex); ++ if (!err) ++ set_bit(bit, p); ++ else ++ goto out; ++ } ++ } ++ ++out: ++ return err; ++} ++ ++static int xib_restore(struct super_block *sb) ++{ ++ int err; ++ aufs_bindex_t bindex, bbot; ++ void *page; ++ ++ err = -ENOMEM; ++ page = (void *)__get_free_page(GFP_NOFS); ++ if (unlikely(!page)) ++ goto out; ++ ++ err = 0; ++ bbot = au_sbbot(sb); ++ for (bindex = 0; !err && bindex <= bbot; bindex++) ++ if (!bindex || is_sb_shared(sb, bindex, bindex - 1) < 0) ++ err = do_xib_restore ++ (sb, au_sbr(sb, bindex)->br_xino.xi_file, page); ++ else ++ AuDbg("b%d\n", bindex); ++ free_page((unsigned long)page); ++ ++out: ++ return err; ++} ++ ++int au_xib_trunc(struct super_block *sb) ++{ ++ int err; ++ ssize_t sz; ++ loff_t pos; ++ struct au_xino_lock_dir ldir; ++ struct au_sbinfo *sbinfo; ++ unsigned long *p; ++ struct file *file; ++ ++ SiMustWriteLock(sb); ++ ++ err = 0; ++ sbinfo = au_sbi(sb); ++ if (!au_opt_test(sbinfo->si_mntflags, XINO)) ++ goto out; ++ ++ file = sbinfo->si_xib; ++ if (vfsub_f_size_read(file) <= PAGE_SIZE) ++ goto out; ++ ++ au_xino_lock_dir(sb, file, &ldir); ++ /* mnt_want_write() is unnecessary here */ ++ file = au_xino_create2(sbinfo->si_xib, NULL); ++ au_xino_unlock_dir(&ldir); ++ err = PTR_ERR(file); ++ if (IS_ERR(file)) ++ goto out; ++ fput(sbinfo->si_xib); ++ sbinfo->si_xib = file; ++ ++ p = sbinfo->si_xib_buf; ++ memset(p, 0, PAGE_SIZE); ++ pos = 0; ++ sz = xino_fwrite(sbinfo->si_xwrite, sbinfo->si_xib, p, PAGE_SIZE, &pos); ++ if (unlikely(sz != PAGE_SIZE)) { ++ err = sz; ++ AuIOErr("err %d\n", err); ++ if (sz >= 0) ++ err = -EIO; ++ goto out; ++ } ++ ++ mutex_lock(&sbinfo->si_xib_mtx); ++ /* mnt_want_write() is unnecessary here */ ++ err = xib_restore(sb); ++ mutex_unlock(&sbinfo->si_xib_mtx); ++ ++out: ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * xino mount option handlers ++ */ ++ ++/* xino bitmap */ ++static void xino_clear_xib(struct super_block *sb) ++{ ++ struct au_sbinfo *sbinfo; ++ ++ SiMustWriteLock(sb); ++ ++ sbinfo = au_sbi(sb); ++ sbinfo->si_xread = NULL; ++ sbinfo->si_xwrite = NULL; ++ if (sbinfo->si_xib) ++ fput(sbinfo->si_xib); ++ sbinfo->si_xib = NULL; ++ free_page((unsigned long)sbinfo->si_xib_buf); ++ sbinfo->si_xib_buf = NULL; ++} ++ ++static int au_xino_set_xib(struct super_block *sb, struct file *base) ++{ ++ int err; ++ loff_t pos; ++ struct au_sbinfo *sbinfo; ++ struct file *file; ++ ++ SiMustWriteLock(sb); ++ ++ sbinfo = au_sbi(sb); ++ file = au_xino_create2(base, sbinfo->si_xib); ++ err = PTR_ERR(file); ++ if (IS_ERR(file)) ++ goto out; ++ if (sbinfo->si_xib) ++ fput(sbinfo->si_xib); ++ sbinfo->si_xib = file; ++ sbinfo->si_xread = vfs_readf(file); ++ sbinfo->si_xwrite = vfs_writef(file); ++ ++ err = -ENOMEM; ++ if (!sbinfo->si_xib_buf) ++ sbinfo->si_xib_buf = (void *)get_zeroed_page(GFP_NOFS); ++ if (unlikely(!sbinfo->si_xib_buf)) ++ goto out_unset; ++ ++ sbinfo->si_xib_last_pindex = 0; ++ sbinfo->si_xib_next_bit = 0; ++ if (vfsub_f_size_read(file) < PAGE_SIZE) { ++ pos = 0; ++ err = xino_fwrite(sbinfo->si_xwrite, file, sbinfo->si_xib_buf, ++ PAGE_SIZE, &pos); ++ if (unlikely(err != PAGE_SIZE)) ++ goto out_free; ++ } ++ err = 0; ++ goto out; /* success */ ++ ++out_free: ++ free_page((unsigned long)sbinfo->si_xib_buf); ++ sbinfo->si_xib_buf = NULL; ++ if (err >= 0) ++ err = -EIO; ++out_unset: ++ fput(sbinfo->si_xib); ++ sbinfo->si_xib = NULL; ++ sbinfo->si_xread = NULL; ++ sbinfo->si_xwrite = NULL; ++out: ++ return err; ++} ++ ++/* xino for each branch */ ++static void xino_clear_br(struct super_block *sb) ++{ ++ aufs_bindex_t bindex, bbot; ++ struct au_branch *br; ++ ++ bbot = au_sbbot(sb); ++ for (bindex = 0; bindex <= bbot; bindex++) { ++ br = au_sbr(sb, bindex); ++ if (!br || !br->br_xino.xi_file) ++ continue; ++ ++ fput(br->br_xino.xi_file); ++ br->br_xino.xi_file = NULL; ++ } ++} ++ ++static int au_xino_set_br(struct super_block *sb, struct file *base) ++{ ++ int err; ++ ino_t ino; ++ aufs_bindex_t bindex, bbot, bshared; ++ struct { ++ struct file *old, *new; ++ } *fpair, *p; ++ struct au_branch *br; ++ struct inode *inode; ++ vfs_writef_t writef; ++ ++ SiMustWriteLock(sb); ++ ++ err = -ENOMEM; ++ bbot = au_sbbot(sb); ++ fpair = kcalloc(bbot + 1, sizeof(*fpair), GFP_NOFS); ++ if (unlikely(!fpair)) ++ goto out; ++ ++ inode = d_inode(sb->s_root); ++ ino = AUFS_ROOT_INO; ++ writef = au_sbi(sb)->si_xwrite; ++ for (bindex = 0, p = fpair; bindex <= bbot; bindex++, p++) { ++ br = au_sbr(sb, bindex); ++ bshared = is_sb_shared(sb, bindex, bindex - 1); ++ if (bshared >= 0) { ++ /* shared xino */ ++ *p = fpair[bshared]; ++ get_file(p->new); ++ } ++ ++ if (!p->new) { ++ /* new xino */ ++ p->old = br->br_xino.xi_file; ++ p->new = au_xino_create2(base, br->br_xino.xi_file); ++ err = PTR_ERR(p->new); ++ if (IS_ERR(p->new)) { ++ p->new = NULL; ++ goto out_pair; ++ } ++ } ++ ++ err = au_xino_do_write(writef, p->new, ++ au_h_iptr(inode, bindex)->i_ino, ino); ++ if (unlikely(err)) ++ goto out_pair; ++ } ++ ++ for (bindex = 0, p = fpair; bindex <= bbot; bindex++, p++) { ++ br = au_sbr(sb, bindex); ++ if (br->br_xino.xi_file) ++ fput(br->br_xino.xi_file); ++ get_file(p->new); ++ br->br_xino.xi_file = p->new; ++ } ++ ++out_pair: ++ for (bindex = 0, p = fpair; bindex <= bbot; bindex++, p++) ++ if (p->new) ++ fput(p->new); ++ else ++ break; ++ kfree(fpair); ++out: ++ return err; ++} ++ ++void au_xino_clr(struct super_block *sb) ++{ ++ struct au_sbinfo *sbinfo; ++ ++ au_xigen_clr(sb); ++ xino_clear_xib(sb); ++ xino_clear_br(sb); ++ sbinfo = au_sbi(sb); ++ /* lvalue, do not call au_mntflags() */ ++ au_opt_clr(sbinfo->si_mntflags, XINO); ++} ++ ++int au_xino_set(struct super_block *sb, struct au_opt_xino *xino, int remount) ++{ ++ int err, skip; ++ struct dentry *parent, *cur_parent; ++ struct qstr *dname, *cur_name; ++ struct file *cur_xino; ++ struct inode *dir; ++ struct au_sbinfo *sbinfo; ++ ++ SiMustWriteLock(sb); ++ ++ err = 0; ++ sbinfo = au_sbi(sb); ++ parent = dget_parent(xino->file->f_path.dentry); ++ if (remount) { ++ skip = 0; ++ dname = &xino->file->f_path.dentry->d_name; ++ cur_xino = sbinfo->si_xib; ++ if (cur_xino) { ++ cur_parent = dget_parent(cur_xino->f_path.dentry); ++ cur_name = &cur_xino->f_path.dentry->d_name; ++ skip = (cur_parent == parent ++ && au_qstreq(dname, cur_name)); ++ dput(cur_parent); ++ } ++ if (skip) ++ goto out; ++ } ++ ++ au_opt_set(sbinfo->si_mntflags, XINO); ++ dir = d_inode(parent); ++ inode_lock_nested(dir, AuLsc_I_PARENT); ++ /* mnt_want_write() is unnecessary here */ ++ err = au_xino_set_xib(sb, xino->file); ++ if (!err) ++ err = au_xigen_set(sb, xino->file); ++ if (!err) ++ err = au_xino_set_br(sb, xino->file); ++ inode_unlock(dir); ++ if (!err) ++ goto out; /* success */ ++ ++ /* reset all */ ++ AuIOErr("failed creating xino(%d).\n", err); ++ au_xigen_clr(sb); ++ xino_clear_xib(sb); ++ ++out: ++ dput(parent); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * create a xinofile at the default place/path. ++ */ ++struct file *au_xino_def(struct super_block *sb) ++{ ++ struct file *file; ++ char *page, *p; ++ struct au_branch *br; ++ struct super_block *h_sb; ++ struct path path; ++ aufs_bindex_t bbot, bindex, bwr; ++ ++ br = NULL; ++ bbot = au_sbbot(sb); ++ bwr = -1; ++ for (bindex = 0; bindex <= bbot; bindex++) { ++ br = au_sbr(sb, bindex); ++ if (au_br_writable(br->br_perm) ++ && !au_test_fs_bad_xino(au_br_sb(br))) { ++ bwr = bindex; ++ break; ++ } ++ } ++ ++ if (bwr >= 0) { ++ file = ERR_PTR(-ENOMEM); ++ page = (void *)__get_free_page(GFP_NOFS); ++ if (unlikely(!page)) ++ goto out; ++ path.mnt = au_br_mnt(br); ++ path.dentry = au_h_dptr(sb->s_root, bwr); ++ p = d_path(&path, page, PATH_MAX - sizeof(AUFS_XINO_FNAME)); ++ file = (void *)p; ++ if (!IS_ERR(p)) { ++ strcat(p, "/" AUFS_XINO_FNAME); ++ AuDbg("%s\n", p); ++ file = au_xino_create(sb, p, /*silent*/0); ++ if (!IS_ERR(file)) ++ au_xino_brid_set(sb, br->br_id); ++ } ++ free_page((unsigned long)page); ++ } else { ++ file = au_xino_create(sb, AUFS_XINO_DEFPATH, /*silent*/0); ++ if (IS_ERR(file)) ++ goto out; ++ h_sb = file->f_path.dentry->d_sb; ++ if (unlikely(au_test_fs_bad_xino(h_sb))) { ++ pr_err("xino doesn't support %s(%s)\n", ++ AUFS_XINO_DEFPATH, au_sbtype(h_sb)); ++ fput(file); ++ file = ERR_PTR(-EINVAL); ++ } ++ if (!IS_ERR(file)) ++ au_xino_brid_set(sb, -1); ++ } ++ ++out: ++ return file; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++int au_xino_path(struct seq_file *seq, struct file *file) ++{ ++ int err; ++ ++ err = au_seq_path(seq, &file->f_path); ++ if (unlikely(err)) ++ goto out; ++ ++#define Deleted "\\040(deleted)" ++ seq->count -= sizeof(Deleted) - 1; ++ AuDebugOn(memcmp(seq->buf + seq->count, Deleted, ++ sizeof(Deleted) - 1)); ++#undef Deleted ++ ++out: ++ return err; ++} +diff --git a/fs/dcache.c b/fs/dcache.c +index d5ecc6e..0dd0237 100644 +--- a/fs/dcache.c ++++ b/fs/dcache.c +@@ -1156,7 +1156,7 @@ enum d_walk_ret { + * + * The @enter() and @finish() callbacks are called with d_lock held. + */ +-static void d_walk(struct dentry *parent, void *data, ++void d_walk(struct dentry *parent, void *data, + enum d_walk_ret (*enter)(void *, struct dentry *), + void (*finish)(void *)) + { +@@ -1261,6 +1261,7 @@ rename_retry: + seq = 1; + goto again; + } ++EXPORT_SYMBOL_GPL(d_walk); + + /* + * Search for at least 1 mount point in the dentry's subdirs. +diff --git a/fs/exec.c b/fs/exec.c +index c4010b8..c2b225f 100644 +--- a/fs/exec.c ++++ b/fs/exec.c +@@ -104,6 +104,7 @@ bool path_noexec(const struct path *path) + return (path->mnt->mnt_flags & MNT_NOEXEC) || + (path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC); + } ++EXPORT_SYMBOL_GPL(path_noexec); + + #ifdef CONFIG_USELIB + /* +diff --git a/fs/fcntl.c b/fs/fcntl.c +index 350a2c8..04fd33c 100644 +--- a/fs/fcntl.c ++++ b/fs/fcntl.c +@@ -29,7 +29,7 @@ + + #define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME) + +-static int setfl(int fd, struct file * filp, unsigned long arg) ++int setfl(int fd, struct file * filp, unsigned long arg) + { + struct inode * inode = file_inode(filp); + int error = 0; +@@ -60,6 +60,8 @@ static int setfl(int fd, struct file * filp, unsigned long arg) + + if (filp->f_op->check_flags) + error = filp->f_op->check_flags(arg); ++ if (!error && filp->f_op->setfl) ++ error = filp->f_op->setfl(filp, arg); + if (error) + return error; + +@@ -80,6 +82,7 @@ static int setfl(int fd, struct file * filp, unsigned long arg) + out: + return error; + } ++EXPORT_SYMBOL_GPL(setfl); + + static void f_modown(struct file *filp, struct pid *pid, enum pid_type type, + int force) +diff --git a/fs/file_table.c b/fs/file_table.c +index ad17e05..ae9f267 100644 +--- a/fs/file_table.c ++++ b/fs/file_table.c +@@ -147,6 +147,7 @@ over: + } + return ERR_PTR(-ENFILE); + } ++EXPORT_SYMBOL_GPL(get_empty_filp); + + /** + * alloc_file - allocate and initialize a 'struct file' +@@ -258,6 +259,7 @@ void flush_delayed_fput(void) + { + delayed_fput(NULL); + } ++EXPORT_SYMBOL_GPL(flush_delayed_fput); + + static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput); + +@@ -300,6 +302,7 @@ void __fput_sync(struct file *file) + } + + EXPORT_SYMBOL(fput); ++EXPORT_SYMBOL_GPL(__fput_sync); + + void put_filp(struct file *file) + { +@@ -308,6 +311,7 @@ void put_filp(struct file *file) + file_free(file); + } + } ++EXPORT_SYMBOL_GPL(put_filp); + + void __init files_init(void) + { +diff --git a/fs/inode.c b/fs/inode.c +index 69b8b52..f46e5c6 100644 +--- a/fs/inode.c ++++ b/fs/inode.c +@@ -850,6 +850,8 @@ unsigned int get_next_ino(void) + unsigned int *p = &get_cpu_var(last_ino); + unsigned int res = *p; + ++start: ++ + #ifdef CONFIG_SMP + if (unlikely((res & (LAST_INO_BATCH-1)) == 0)) { + static atomic_t shared_last_ino; +@@ -862,7 +864,7 @@ unsigned int get_next_ino(void) + res++; + /* get_next_ino should not provide a 0 inode number */ + if (unlikely(!res)) +- res++; ++ goto start; + *p = res; + put_cpu_var(last_ino); + return res; +diff --git a/fs/namespace.c b/fs/namespace.c +index 4fb1691..97654d2 100644 +--- a/fs/namespace.c ++++ b/fs/namespace.c +@@ -463,6 +463,7 @@ void __mnt_drop_write(struct vfsmount *mnt) + mnt_dec_writers(real_mount(mnt)); + preempt_enable(); + } ++EXPORT_SYMBOL_GPL(__mnt_drop_write); + + /** + * mnt_drop_write - give up write access to a mount +@@ -1811,6 +1812,7 @@ int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg, + } + return 0; + } ++EXPORT_SYMBOL_GPL(iterate_mounts); + + static void cleanup_group_ids(struct mount *mnt, struct mount *end) + { +diff --git a/fs/notify/group.c b/fs/notify/group.c +index d16b62c..53e45b6 100644 +--- a/fs/notify/group.c ++++ b/fs/notify/group.c +@@ -22,6 +22,7 @@ + #include + #include + #include ++#include + + #include + #include "fsnotify.h" +@@ -72,6 +73,7 @@ void fsnotify_get_group(struct fsnotify_group *group) + { + atomic_inc(&group->refcnt); + } ++EXPORT_SYMBOL_GPL(fsnotify_get_group); + + /* + * Drop a reference to a group. Free it if it's through. +@@ -81,6 +83,7 @@ void fsnotify_put_group(struct fsnotify_group *group) + if (atomic_dec_and_test(&group->refcnt)) + fsnotify_final_destroy_group(group); + } ++EXPORT_SYMBOL_GPL(fsnotify_put_group); + + /* + * Create a new fsnotify_group and hold a reference for the group returned. +@@ -109,6 +112,7 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops) + + return group; + } ++EXPORT_SYMBOL_GPL(fsnotify_alloc_group); + + int fsnotify_fasync(int fd, struct file *file, int on) + { +diff --git a/fs/notify/mark.c b/fs/notify/mark.c +index 7115c5d..ac2bd69 100644 +--- a/fs/notify/mark.c ++++ b/fs/notify/mark.c +@@ -113,6 +113,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark) + mark->free_mark(mark); + } + } ++EXPORT_SYMBOL_GPL(fsnotify_put_mark); + + /* Calculate mask of events for a list of marks */ + u32 fsnotify_recalc_mask(struct hlist_head *head) +@@ -213,6 +214,7 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark, + mutex_unlock(&group->mark_mutex); + fsnotify_free_mark(mark); + } ++EXPORT_SYMBOL_GPL(fsnotify_destroy_mark); + + void fsnotify_destroy_marks(struct hlist_head *head, spinlock_t *lock) + { +@@ -398,6 +400,7 @@ err: + + return ret; + } ++EXPORT_SYMBOL_GPL(fsnotify_add_mark); + + int fsnotify_add_mark(struct fsnotify_mark *mark, struct fsnotify_group *group, + struct inode *inode, struct vfsmount *mnt, int allow_dups) +@@ -498,6 +501,7 @@ void fsnotify_init_mark(struct fsnotify_mark *mark, + atomic_set(&mark->refcnt, 1); + mark->free_mark = free_mark; + } ++EXPORT_SYMBOL_GPL(fsnotify_init_mark); + + static void fsnotify_mark_destroy(struct work_struct *work) + { +diff --git a/fs/open.c b/fs/open.c +index 081d3d6..b4359e4 100644 +--- a/fs/open.c ++++ b/fs/open.c +@@ -64,6 +64,7 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, + inode_unlock(dentry->d_inode); + return ret; + } ++EXPORT_SYMBOL_GPL(do_truncate); + + long vfs_truncate(struct path *path, loff_t length) + { +@@ -678,6 +679,7 @@ int open_check_o_direct(struct file *f) + } + return 0; + } ++EXPORT_SYMBOL_GPL(open_check_o_direct); + + static int do_dentry_open(struct file *f, + struct inode *inode, +diff --git a/fs/proc/base.c b/fs/proc/base.c +index 0d163a8..b958f79 100644 +--- a/fs/proc/base.c ++++ b/fs/proc/base.c +@@ -1934,7 +1934,7 @@ static int map_files_get_link(struct dentry *dentry, struct path *path) + down_read(&mm->mmap_sem); + vma = find_exact_vma(mm, vm_start, vm_end); + if (vma && vma->vm_file) { +- *path = vma->vm_file->f_path; ++ *path = vma_pr_or_file(vma)->f_path; + path_get(path); + rc = 0; + } +diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c +index f8595e8..cb8eda0 100644 +--- a/fs/proc/nommu.c ++++ b/fs/proc/nommu.c +@@ -45,7 +45,10 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region) + file = region->vm_file; + + if (file) { +- struct inode *inode = file_inode(region->vm_file); ++ struct inode *inode; ++ ++ file = vmr_pr_or_file(region); ++ inode = file_inode(file); + dev = inode->i_sb->s_dev; + ino = inode->i_ino; + } +diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c +index 5415835..c41eb73 100644 +--- a/fs/proc/task_mmu.c ++++ b/fs/proc/task_mmu.c +@@ -298,7 +298,10 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) + const char *name = NULL; + + if (file) { +- struct inode *inode = file_inode(vma->vm_file); ++ struct inode *inode; ++ ++ file = vma_pr_or_file(vma); ++ inode = file_inode(file); + dev = inode->i_sb->s_dev; + ino = inode->i_ino; + pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT; +@@ -1617,7 +1620,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) + struct proc_maps_private *proc_priv = &numa_priv->proc_maps; + struct vm_area_struct *vma = v; + struct numa_maps *md = &numa_priv->md; +- struct file *file = vma->vm_file; ++ struct file *file = vma_pr_or_file(vma); + struct mm_struct *mm = vma->vm_mm; + struct mm_walk walk = { + .hugetlb_entry = gather_hugetlb_stats, +diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c +index faacb0c..17b43be 100644 +--- a/fs/proc/task_nommu.c ++++ b/fs/proc/task_nommu.c +@@ -163,7 +163,10 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma, + file = vma->vm_file; + + if (file) { +- struct inode *inode = file_inode(vma->vm_file); ++ struct inode *inode; ++ ++ file = vma_pr_or_file(vma); ++ inode = file_inode(file); + dev = inode->i_sb->s_dev; + ino = inode->i_ino; + pgoff = (loff_t)vma->vm_pgoff << PAGE_SHIFT; +diff --git a/fs/read_write.c b/fs/read_write.c +index cf377cf..0a43d7b 100644 +--- a/fs/read_write.c ++++ b/fs/read_write.c +@@ -534,6 +534,30 @@ ssize_t __vfs_write(struct file *file, const char __user *p, size_t count, + } + EXPORT_SYMBOL(__vfs_write); + ++vfs_readf_t vfs_readf(struct file *file) ++{ ++ const struct file_operations *fop = file->f_op; ++ ++ if (fop->read) ++ return fop->read; ++ if (fop->read_iter) ++ return new_sync_read; ++ return ERR_PTR(-ENOSYS); ++} ++EXPORT_SYMBOL_GPL(vfs_readf); ++ ++vfs_writef_t vfs_writef(struct file *file) ++{ ++ const struct file_operations *fop = file->f_op; ++ ++ if (fop->write) ++ return fop->write; ++ if (fop->write_iter) ++ return new_sync_write; ++ return ERR_PTR(-ENOSYS); ++} ++EXPORT_SYMBOL_GPL(vfs_writef); ++ + ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t *pos) + { + mm_segment_t old_fs; +diff --git a/fs/splice.c b/fs/splice.c +index dd9bf7e..0606690 100644 +--- a/fs/splice.c ++++ b/fs/splice.c +@@ -1111,8 +1111,8 @@ EXPORT_SYMBOL(generic_splice_sendpage); + /* + * Attempt to initiate a splice from pipe to file. + */ +-static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, +- loff_t *ppos, size_t len, unsigned int flags) ++long do_splice_from(struct pipe_inode_info *pipe, struct file *out, ++ loff_t *ppos, size_t len, unsigned int flags) + { + ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, + loff_t *, size_t, unsigned int); +@@ -1124,13 +1124,14 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, + + return splice_write(pipe, out, ppos, len, flags); + } ++EXPORT_SYMBOL_GPL(do_splice_from); + + /* + * Attempt to initiate a splice from a file to a pipe. + */ +-static long do_splice_to(struct file *in, loff_t *ppos, +- struct pipe_inode_info *pipe, size_t len, +- unsigned int flags) ++long do_splice_to(struct file *in, loff_t *ppos, ++ struct pipe_inode_info *pipe, size_t len, ++ unsigned int flags) + { + ssize_t (*splice_read)(struct file *, loff_t *, + struct pipe_inode_info *, size_t, unsigned int); +@@ -1153,6 +1154,7 @@ static long do_splice_to(struct file *in, loff_t *ppos, + + return splice_read(in, ppos, pipe, len, flags); + } ++EXPORT_SYMBOL_GPL(do_splice_to); + + /** + * splice_direct_to_actor - splices data directly between two non-pipes +diff --git a/fs/xattr.c b/fs/xattr.c +index 4861322..c4bb039 100644 +--- a/fs/xattr.c ++++ b/fs/xattr.c +@@ -207,6 +207,7 @@ vfs_getxattr_alloc(struct dentry *dentry, const char *name, char **xattr_value, + *xattr_value = value; + return error; + } ++EXPORT_SYMBOL_GPL(vfs_getxattr_alloc); + + ssize_t + vfs_getxattr(struct dentry *dentry, const char *name, void *value, size_t size) +diff --git a/include/linux/file.h b/include/linux/file.h +index f87d308..9a290b3 100644 +--- a/include/linux/file.h ++++ b/include/linux/file.h +@@ -19,6 +19,7 @@ struct dentry; + struct path; + extern struct file *alloc_file(struct path *, fmode_t mode, + const struct file_operations *fop); ++extern struct file *get_empty_filp(void); + + static inline void fput_light(struct file *file, int fput_needed) + { +diff --git a/include/linux/fs.h b/include/linux/fs.h +index 70e61b5..351bb05 100644 +--- a/include/linux/fs.h ++++ b/include/linux/fs.h +@@ -1277,6 +1277,7 @@ extern void fasync_free(struct fasync_struct *); + /* can be called from interrupts */ + extern void kill_fasync(struct fasync_struct **, int, int); + ++extern int setfl(int fd, struct file * filp, unsigned long arg); + extern void __f_setown(struct file *filp, struct pid *, enum pid_type, int force); + extern void f_setown(struct file *filp, unsigned long arg, int force); + extern void f_delown(struct file *filp); +@@ -1660,6 +1661,7 @@ struct file_operations { + ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int); + unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); + int (*check_flags)(int); ++ int (*setfl)(struct file *, unsigned long); + int (*flock) (struct file *, int, struct file_lock *); + ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); + ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); +@@ -1718,6 +1720,12 @@ ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, + struct iovec *fast_pointer, + struct iovec **ret_pointer); + ++typedef ssize_t (*vfs_readf_t)(struct file *, char __user *, size_t, loff_t *); ++typedef ssize_t (*vfs_writef_t)(struct file *, const char __user *, size_t, ++ loff_t *); ++vfs_readf_t vfs_readf(struct file *file); ++vfs_writef_t vfs_writef(struct file *file); ++ + extern ssize_t __vfs_read(struct file *, char __user *, size_t, loff_t *); + extern ssize_t __vfs_write(struct file *, const char __user *, size_t, loff_t *); + extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *); +diff --git a/include/linux/mm.h b/include/linux/mm.h +index 02f7f31..689b05b 100644 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -1251,6 +1251,28 @@ static inline int fixup_user_fault(struct task_struct *tsk, + } + #endif + ++extern void vma_do_file_update_time(struct vm_area_struct *, const char[], int); ++extern struct file *vma_do_pr_or_file(struct vm_area_struct *, const char[], ++ int); ++extern void vma_do_get_file(struct vm_area_struct *, const char[], int); ++extern void vma_do_fput(struct vm_area_struct *, const char[], int); ++ ++#define vma_file_update_time(vma) vma_do_file_update_time(vma, __func__, \ ++ __LINE__) ++#define vma_pr_or_file(vma) vma_do_pr_or_file(vma, __func__, \ ++ __LINE__) ++#define vma_get_file(vma) vma_do_get_file(vma, __func__, __LINE__) ++#define vma_fput(vma) vma_do_fput(vma, __func__, __LINE__) ++ ++#ifndef CONFIG_MMU ++extern struct file *vmr_do_pr_or_file(struct vm_region *, const char[], int); ++extern void vmr_do_fput(struct vm_region *, const char[], int); ++ ++#define vmr_pr_or_file(region) vmr_do_pr_or_file(region, __func__, \ ++ __LINE__) ++#define vmr_fput(region) vmr_do_fput(region, __func__, __LINE__) ++#endif /* !CONFIG_MMU */ ++ + extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write); + extern int access_remote_vm(struct mm_struct *mm, unsigned long addr, + void *buf, int len, int write); +diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h +index c2d75b4..9e324fe 100644 +--- a/include/linux/mm_types.h ++++ b/include/linux/mm_types.h +@@ -269,6 +269,7 @@ struct vm_region { + unsigned long vm_top; /* region allocated to here */ + unsigned long vm_pgoff; /* the offset in vm_file corresponding to vm_start */ + struct file *vm_file; /* the backing file or NULL */ ++ struct file *vm_prfile; /* the virtual backing file or NULL */ + + int vm_usage; /* region usage count (access under nommu_region_sem) */ + bool vm_icache_flushed : 1; /* true if the icache has been flushed for +@@ -343,6 +344,7 @@ struct vm_area_struct { + unsigned long vm_pgoff; /* Offset (within vm_file) in PAGE_SIZE + units */ + struct file * vm_file; /* File we map to (can be NULL). */ ++ struct file *vm_prfile; /* shadow of vm_file */ + void * vm_private_data; /* was vm_pte (shared mem) */ + + #ifndef CONFIG_MMU +diff --git a/include/linux/splice.h b/include/linux/splice.h +index da2751d..2e0fca6 100644 +--- a/include/linux/splice.h ++++ b/include/linux/splice.h +@@ -83,4 +83,10 @@ extern void splice_shrink_spd(struct splice_pipe_desc *); + extern void spd_release_page(struct splice_pipe_desc *, unsigned int); + + extern const struct pipe_buf_operations page_cache_pipe_buf_ops; ++ ++extern long do_splice_from(struct pipe_inode_info *pipe, struct file *out, ++ loff_t *ppos, size_t len, unsigned int flags); ++extern long do_splice_to(struct file *in, loff_t *ppos, ++ struct pipe_inode_info *pipe, size_t len, ++ unsigned int flags); + #endif +diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild +index 813ffb2e..ac2202e 100644 +--- a/include/uapi/linux/Kbuild ++++ b/include/uapi/linux/Kbuild +@@ -59,6 +59,7 @@ header-y += atmsvc.h + header-y += atm_tcp.h + header-y += atm_zatm.h + header-y += audit.h ++header-y += aufs_type.h + header-y += auto_fs4.h + header-y += auto_fs.h + header-y += auxvec.h +diff --git a/include/uapi/linux/aufs_type.h b/include/uapi/linux/aufs_type.h +new file mode 100644 +index 0000000..e27dde7 +--- /dev/null ++++ b/include/uapi/linux/aufs_type.h +@@ -0,0 +1,419 @@ ++/* ++ * Copyright (C) 2005-2016 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++#ifndef __AUFS_TYPE_H__ ++#define __AUFS_TYPE_H__ ++ ++#define AUFS_NAME "aufs" ++ ++#ifdef __KERNEL__ ++/* ++ * define it before including all other headers. ++ * sched.h may use pr_* macros before defining "current", so define the ++ * no-current version first, and re-define later. ++ */ ++#define pr_fmt(fmt) AUFS_NAME " %s:%d: " fmt, __func__, __LINE__ ++#include ++#undef pr_fmt ++#define pr_fmt(fmt) \ ++ AUFS_NAME " %s:%d:%.*s[%d]: " fmt, __func__, __LINE__, \ ++ (int)sizeof(current->comm), current->comm, current->pid ++#else ++#include ++#include ++#endif /* __KERNEL__ */ ++ ++#include ++ ++#define AUFS_VERSION "4.6-20160530" ++ ++/* todo? move this to linux-2.6.19/include/magic.h */ ++#define AUFS_SUPER_MAGIC ('a' << 24 | 'u' << 16 | 'f' << 8 | 's') ++ ++/* ---------------------------------------------------------------------- */ ++ ++#ifdef CONFIG_AUFS_BRANCH_MAX_127 ++typedef int8_t aufs_bindex_t; ++#define AUFS_BRANCH_MAX 127 ++#else ++typedef int16_t aufs_bindex_t; ++#ifdef CONFIG_AUFS_BRANCH_MAX_511 ++#define AUFS_BRANCH_MAX 511 ++#elif defined(CONFIG_AUFS_BRANCH_MAX_1023) ++#define AUFS_BRANCH_MAX 1023 ++#elif defined(CONFIG_AUFS_BRANCH_MAX_32767) ++#define AUFS_BRANCH_MAX 32767 ++#endif ++#endif ++ ++#ifdef __KERNEL__ ++#ifndef AUFS_BRANCH_MAX ++#error unknown CONFIG_AUFS_BRANCH_MAX value ++#endif ++#endif /* __KERNEL__ */ ++ ++/* ---------------------------------------------------------------------- */ ++ ++#define AUFS_FSTYPE AUFS_NAME ++ ++#define AUFS_ROOT_INO 2 ++#define AUFS_FIRST_INO 11 ++ ++#define AUFS_WH_PFX ".wh." ++#define AUFS_WH_PFX_LEN ((int)sizeof(AUFS_WH_PFX) - 1) ++#define AUFS_WH_TMP_LEN 4 ++/* a limit for rmdir/rename a dir and copyup */ ++#define AUFS_MAX_NAMELEN (NAME_MAX \ ++ - AUFS_WH_PFX_LEN * 2 /* doubly whiteouted */\ ++ - 1 /* dot */\ ++ - AUFS_WH_TMP_LEN) /* hex */ ++#define AUFS_XINO_FNAME "." AUFS_NAME ".xino" ++#define AUFS_XINO_DEFPATH "/tmp/" AUFS_XINO_FNAME ++#define AUFS_XINO_DEF_SEC 30 /* seconds */ ++#define AUFS_XINO_DEF_TRUNC 45 /* percentage */ ++#define AUFS_DIRWH_DEF 3 ++#define AUFS_RDCACHE_DEF 10 /* seconds */ ++#define AUFS_RDCACHE_MAX 3600 /* seconds */ ++#define AUFS_RDBLK_DEF 512 /* bytes */ ++#define AUFS_RDHASH_DEF 32 ++#define AUFS_WKQ_NAME AUFS_NAME "d" ++#define AUFS_MFS_DEF_SEC 30 /* seconds */ ++#define AUFS_MFS_MAX_SEC 3600 /* seconds */ ++#define AUFS_FHSM_CACHE_DEF_SEC 30 /* seconds */ ++#define AUFS_PLINK_WARN 50 /* number of plinks in a single bucket */ ++ ++/* pseudo-link maintenace under /proc */ ++#define AUFS_PLINK_MAINT_NAME "plink_maint" ++#define AUFS_PLINK_MAINT_DIR "fs/" AUFS_NAME ++#define AUFS_PLINK_MAINT_PATH AUFS_PLINK_MAINT_DIR "/" AUFS_PLINK_MAINT_NAME ++ ++#define AUFS_DIROPQ_NAME AUFS_WH_PFX ".opq" /* whiteouted doubly */ ++#define AUFS_WH_DIROPQ AUFS_WH_PFX AUFS_DIROPQ_NAME ++ ++#define AUFS_BASE_NAME AUFS_WH_PFX AUFS_NAME ++#define AUFS_PLINKDIR_NAME AUFS_WH_PFX "plnk" ++#define AUFS_ORPHDIR_NAME AUFS_WH_PFX "orph" ++ ++/* doubly whiteouted */ ++#define AUFS_WH_BASE AUFS_WH_PFX AUFS_BASE_NAME ++#define AUFS_WH_PLINKDIR AUFS_WH_PFX AUFS_PLINKDIR_NAME ++#define AUFS_WH_ORPHDIR AUFS_WH_PFX AUFS_ORPHDIR_NAME ++ ++/* branch permissions and attributes */ ++#define AUFS_BRPERM_RW "rw" ++#define AUFS_BRPERM_RO "ro" ++#define AUFS_BRPERM_RR "rr" ++#define AUFS_BRATTR_COO_REG "coo_reg" ++#define AUFS_BRATTR_COO_ALL "coo_all" ++#define AUFS_BRATTR_FHSM "fhsm" ++#define AUFS_BRATTR_UNPIN "unpin" ++#define AUFS_BRATTR_ICEX "icex" ++#define AUFS_BRATTR_ICEX_SEC "icexsec" ++#define AUFS_BRATTR_ICEX_SYS "icexsys" ++#define AUFS_BRATTR_ICEX_TR "icextr" ++#define AUFS_BRATTR_ICEX_USR "icexusr" ++#define AUFS_BRATTR_ICEX_OTH "icexoth" ++#define AUFS_BRRATTR_WH "wh" ++#define AUFS_BRWATTR_NLWH "nolwh" ++#define AUFS_BRWATTR_MOO "moo" ++ ++#define AuBrPerm_RW 1 /* writable, hardlinkable wh */ ++#define AuBrPerm_RO (1 << 1) /* readonly */ ++#define AuBrPerm_RR (1 << 2) /* natively readonly */ ++#define AuBrPerm_Mask (AuBrPerm_RW | AuBrPerm_RO | AuBrPerm_RR) ++ ++#define AuBrAttr_COO_REG (1 << 3) /* copy-up on open */ ++#define AuBrAttr_COO_ALL (1 << 4) ++#define AuBrAttr_COO_Mask (AuBrAttr_COO_REG | AuBrAttr_COO_ALL) ++ ++#define AuBrAttr_FHSM (1 << 5) /* file-based hsm */ ++#define AuBrAttr_UNPIN (1 << 6) /* rename-able top dir of ++ branch. meaningless since ++ linux-3.18-rc1 */ ++ ++/* ignore error in copying XATTR */ ++#define AuBrAttr_ICEX_SEC (1 << 7) ++#define AuBrAttr_ICEX_SYS (1 << 8) ++#define AuBrAttr_ICEX_TR (1 << 9) ++#define AuBrAttr_ICEX_USR (1 << 10) ++#define AuBrAttr_ICEX_OTH (1 << 11) ++#define AuBrAttr_ICEX (AuBrAttr_ICEX_SEC \ ++ | AuBrAttr_ICEX_SYS \ ++ | AuBrAttr_ICEX_TR \ ++ | AuBrAttr_ICEX_USR \ ++ | AuBrAttr_ICEX_OTH) ++ ++#define AuBrRAttr_WH (1 << 12) /* whiteout-able */ ++#define AuBrRAttr_Mask AuBrRAttr_WH ++ ++#define AuBrWAttr_NoLinkWH (1 << 13) /* un-hardlinkable whiteouts */ ++#define AuBrWAttr_MOO (1 << 14) /* move-up on open */ ++#define AuBrWAttr_Mask (AuBrWAttr_NoLinkWH | AuBrWAttr_MOO) ++ ++#define AuBrAttr_CMOO_Mask (AuBrAttr_COO_Mask | AuBrWAttr_MOO) ++ ++/* #warning test userspace */ ++#ifdef __KERNEL__ ++#ifndef CONFIG_AUFS_FHSM ++#undef AuBrAttr_FHSM ++#define AuBrAttr_FHSM 0 ++#endif ++#ifndef CONFIG_AUFS_XATTR ++#undef AuBrAttr_ICEX ++#define AuBrAttr_ICEX 0 ++#undef AuBrAttr_ICEX_SEC ++#define AuBrAttr_ICEX_SEC 0 ++#undef AuBrAttr_ICEX_SYS ++#define AuBrAttr_ICEX_SYS 0 ++#undef AuBrAttr_ICEX_TR ++#define AuBrAttr_ICEX_TR 0 ++#undef AuBrAttr_ICEX_USR ++#define AuBrAttr_ICEX_USR 0 ++#undef AuBrAttr_ICEX_OTH ++#define AuBrAttr_ICEX_OTH 0 ++#endif ++#endif ++ ++/* the longest combination */ ++/* AUFS_BRATTR_ICEX and AUFS_BRATTR_ICEX_TR don't affect here */ ++#define AuBrPermStrSz sizeof(AUFS_BRPERM_RW \ ++ "+" AUFS_BRATTR_COO_REG \ ++ "+" AUFS_BRATTR_FHSM \ ++ "+" AUFS_BRATTR_UNPIN \ ++ "+" AUFS_BRATTR_ICEX_SEC \ ++ "+" AUFS_BRATTR_ICEX_SYS \ ++ "+" AUFS_BRATTR_ICEX_USR \ ++ "+" AUFS_BRATTR_ICEX_OTH \ ++ "+" AUFS_BRWATTR_NLWH) ++ ++typedef struct { ++ char a[AuBrPermStrSz]; ++} au_br_perm_str_t; ++ ++static inline int au_br_writable(int brperm) ++{ ++ return brperm & AuBrPerm_RW; ++} ++ ++static inline int au_br_whable(int brperm) ++{ ++ return brperm & (AuBrPerm_RW | AuBrRAttr_WH); ++} ++ ++static inline int au_br_wh_linkable(int brperm) ++{ ++ return !(brperm & AuBrWAttr_NoLinkWH); ++} ++ ++static inline int au_br_cmoo(int brperm) ++{ ++ return brperm & AuBrAttr_CMOO_Mask; ++} ++ ++static inline int au_br_fhsm(int brperm) ++{ ++ return brperm & AuBrAttr_FHSM; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ioctl */ ++enum { ++ /* readdir in userspace */ ++ AuCtl_RDU, ++ AuCtl_RDU_INO, ++ ++ AuCtl_WBR_FD, /* pathconf wrapper */ ++ AuCtl_IBUSY, /* busy inode */ ++ AuCtl_MVDOWN, /* move-down */ ++ AuCtl_BR, /* info about branches */ ++ AuCtl_FHSM_FD /* connection for fhsm */ ++}; ++ ++/* borrowed from linux/include/linux/kernel.h */ ++#ifndef ALIGN ++#define ALIGN(x, a) __ALIGN_MASK(x, (typeof(x))(a)-1) ++#define __ALIGN_MASK(x, mask) (((x)+(mask))&~(mask)) ++#endif ++ ++/* borrowed from linux/include/linux/compiler-gcc3.h */ ++#ifndef __aligned ++#define __aligned(x) __attribute__((aligned(x))) ++#endif ++ ++#ifdef __KERNEL__ ++#ifndef __packed ++#define __packed __attribute__((packed)) ++#endif ++#endif ++ ++struct au_rdu_cookie { ++ uint64_t h_pos; ++ int16_t bindex; ++ uint8_t flags; ++ uint8_t pad; ++ uint32_t generation; ++} __aligned(8); ++ ++struct au_rdu_ent { ++ uint64_t ino; ++ int16_t bindex; ++ uint8_t type; ++ uint8_t nlen; ++ uint8_t wh; ++ char name[0]; ++} __aligned(8); ++ ++static inline int au_rdu_len(int nlen) ++{ ++ /* include the terminating NULL */ ++ return ALIGN(sizeof(struct au_rdu_ent) + nlen + 1, ++ sizeof(uint64_t)); ++} ++ ++union au_rdu_ent_ul { ++ struct au_rdu_ent __user *e; ++ uint64_t ul; ++}; ++ ++enum { ++ AufsCtlRduV_SZ, ++ AufsCtlRduV_End ++}; ++ ++struct aufs_rdu { ++ /* input */ ++ union { ++ uint64_t sz; /* AuCtl_RDU */ ++ uint64_t nent; /* AuCtl_RDU_INO */ ++ }; ++ union au_rdu_ent_ul ent; ++ uint16_t verify[AufsCtlRduV_End]; ++ ++ /* input/output */ ++ uint32_t blk; ++ ++ /* output */ ++ union au_rdu_ent_ul tail; ++ /* number of entries which were added in a single call */ ++ uint64_t rent; ++ uint8_t full; ++ uint8_t shwh; ++ ++ struct au_rdu_cookie cookie; ++} __aligned(8); ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct aufs_wbr_fd { ++ uint32_t oflags; ++ int16_t brid; ++} __aligned(8); ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct aufs_ibusy { ++ uint64_t ino, h_ino; ++ int16_t bindex; ++} __aligned(8); ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* error code for move-down */ ++/* the actual message strings are implemented in aufs-util.git */ ++enum { ++ EAU_MVDOWN_OPAQUE = 1, ++ EAU_MVDOWN_WHITEOUT, ++ EAU_MVDOWN_UPPER, ++ EAU_MVDOWN_BOTTOM, ++ EAU_MVDOWN_NOUPPER, ++ EAU_MVDOWN_NOLOWERBR, ++ EAU_Last ++}; ++ ++/* flags for move-down */ ++#define AUFS_MVDOWN_DMSG 1 ++#define AUFS_MVDOWN_OWLOWER (1 << 1) /* overwrite lower */ ++#define AUFS_MVDOWN_KUPPER (1 << 2) /* keep upper */ ++#define AUFS_MVDOWN_ROLOWER (1 << 3) /* do even if lower is RO */ ++#define AUFS_MVDOWN_ROLOWER_R (1 << 4) /* did on lower RO */ ++#define AUFS_MVDOWN_ROUPPER (1 << 5) /* do even if upper is RO */ ++#define AUFS_MVDOWN_ROUPPER_R (1 << 6) /* did on upper RO */ ++#define AUFS_MVDOWN_BRID_UPPER (1 << 7) /* upper brid */ ++#define AUFS_MVDOWN_BRID_LOWER (1 << 8) /* lower brid */ ++#define AUFS_MVDOWN_FHSM_LOWER (1 << 9) /* find fhsm attr for lower */ ++#define AUFS_MVDOWN_STFS (1 << 10) /* req. stfs */ ++#define AUFS_MVDOWN_STFS_FAILED (1 << 11) /* output: stfs is unusable */ ++#define AUFS_MVDOWN_BOTTOM (1 << 12) /* output: no more lowers */ ++ ++/* index for move-down */ ++enum { ++ AUFS_MVDOWN_UPPER, ++ AUFS_MVDOWN_LOWER, ++ AUFS_MVDOWN_NARRAY ++}; ++ ++/* ++ * additional info of move-down ++ * number of free blocks and inodes. ++ * subset of struct kstatfs, but smaller and always 64bit. ++ */ ++struct aufs_stfs { ++ uint64_t f_blocks; ++ uint64_t f_bavail; ++ uint64_t f_files; ++ uint64_t f_ffree; ++}; ++ ++struct aufs_stbr { ++ int16_t brid; /* optional input */ ++ int16_t bindex; /* output */ ++ struct aufs_stfs stfs; /* output when AUFS_MVDOWN_STFS set */ ++} __aligned(8); ++ ++struct aufs_mvdown { ++ uint32_t flags; /* input/output */ ++ struct aufs_stbr stbr[AUFS_MVDOWN_NARRAY]; /* input/output */ ++ int8_t au_errno; /* output */ ++} __aligned(8); ++ ++/* ---------------------------------------------------------------------- */ ++ ++union aufs_brinfo { ++ /* PATH_MAX may differ between kernel-space and user-space */ ++ char _spacer[4096]; ++ struct { ++ int16_t id; ++ int perm; ++ char path[0]; ++ }; ++} __aligned(8); ++ ++/* ---------------------------------------------------------------------- */ ++ ++#define AuCtlType 'A' ++#define AUFS_CTL_RDU _IOWR(AuCtlType, AuCtl_RDU, struct aufs_rdu) ++#define AUFS_CTL_RDU_INO _IOWR(AuCtlType, AuCtl_RDU_INO, struct aufs_rdu) ++#define AUFS_CTL_WBR_FD _IOW(AuCtlType, AuCtl_WBR_FD, \ ++ struct aufs_wbr_fd) ++#define AUFS_CTL_IBUSY _IOWR(AuCtlType, AuCtl_IBUSY, struct aufs_ibusy) ++#define AUFS_CTL_MVDOWN _IOWR(AuCtlType, AuCtl_MVDOWN, \ ++ struct aufs_mvdown) ++#define AUFS_CTL_BRINFO _IOW(AuCtlType, AuCtl_BR, union aufs_brinfo) ++#define AUFS_CTL_FHSM_FD _IOW(AuCtlType, AuCtl_FHSM_FD, int) ++ ++#endif /* __AUFS_TYPE_H__ */ +diff --git a/kernel/fork.c b/kernel/fork.c +index d277e83..683b8a2 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -475,7 +475,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) + struct inode *inode = file_inode(file); + struct address_space *mapping = file->f_mapping; + +- get_file(file); ++ vma_get_file(tmp); + if (tmp->vm_flags & VM_DENYWRITE) + atomic_dec(&inode->i_writecount); + i_mmap_lock_write(mapping); +diff --git a/kernel/task_work.c b/kernel/task_work.c +index 53fa971..bce3211 100644 +--- a/kernel/task_work.c ++++ b/kernel/task_work.c +@@ -118,3 +118,4 @@ void task_work_run(void) + } while (work); + } + } ++EXPORT_SYMBOL_GPL(task_work_run); +diff --git a/mm/Makefile b/mm/Makefile +index deb467e..0f6ae63 100644 +--- a/mm/Makefile ++++ b/mm/Makefile +@@ -37,7 +37,7 @@ obj-y := filemap.o mempool.o oom_kill.o \ + mm_init.o mmu_context.o percpu.o slab_common.o \ + compaction.o vmacache.o \ + interval_tree.o list_lru.o workingset.o \ +- debug.o $(mmu-y) ++ prfile.o debug.o $(mmu-y) + + obj-y += init-mm.o + +diff --git a/mm/filemap.c b/mm/filemap.c +index f2479af..31f4b0d 100644 +--- a/mm/filemap.c ++++ b/mm/filemap.c +@@ -2211,7 +2211,7 @@ int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) + int ret = VM_FAULT_LOCKED; + + sb_start_pagefault(inode->i_sb); +- file_update_time(vma->vm_file); ++ vma_file_update_time(vma); + lock_page(page); + if (page->mapping != inode->i_mapping) { + unlock_page(page); +diff --git a/mm/memory.c b/mm/memory.c +index 07493e3..dc696bc 100644 +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -2098,7 +2098,7 @@ static inline int wp_page_reuse(struct mm_struct *mm, + } + + if (!page_mkwrite) +- file_update_time(vma->vm_file); ++ vma_file_update_time(vma); + } + + return VM_FAULT_WRITE; +diff --git a/mm/mmap.c b/mm/mmap.c +index bd2e1a53..7328b74 100644 +--- a/mm/mmap.c ++++ b/mm/mmap.c +@@ -166,7 +166,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) + if (vma->vm_ops && vma->vm_ops->close) + vma->vm_ops->close(vma); + if (vma->vm_file) +- fput(vma->vm_file); ++ vma_fput(vma); + mpol_put(vma_policy(vma)); + kmem_cache_free(vm_area_cachep, vma); + return next; +@@ -785,7 +785,7 @@ again: remove_next = 1 + (end > next->vm_end); + if (remove_next) { + if (file) { + uprobe_munmap(next, next->vm_start, next->vm_end); +- fput(file); ++ vma_fput(vma); + } + if (next->anon_vma) + anon_vma_merge(vma, next); +@@ -1566,8 +1566,8 @@ out: + return addr; + + unmap_and_free_vma: ++ vma_fput(vma); + vma->vm_file = NULL; +- fput(file); + + /* Undo any partial mapping done by a device driver. */ + unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end); +@@ -2362,7 +2362,7 @@ static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, + goto out_free_mpol; + + if (new->vm_file) +- get_file(new->vm_file); ++ vma_get_file(new); + + if (new->vm_ops && new->vm_ops->open) + new->vm_ops->open(new); +@@ -2381,7 +2381,7 @@ static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, + if (new->vm_ops && new->vm_ops->close) + new->vm_ops->close(new); + if (new->vm_file) +- fput(new->vm_file); ++ vma_fput(new); + unlink_anon_vmas(new); + out_free_mpol: + mpol_put(vma_policy(new)); +@@ -2523,7 +2523,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, + struct vm_area_struct *vma; + unsigned long populate = 0; + unsigned long ret = -EINVAL; +- struct file *file; ++ struct file *file, *prfile; + + pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/vm/remap_file_pages.txt.\n", + current->comm, current->pid); +@@ -2590,10 +2590,27 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, + } + } + +- file = get_file(vma->vm_file); ++ vma_get_file(vma); ++ file = vma->vm_file; ++ prfile = vma->vm_prfile; + ret = do_mmap_pgoff(vma->vm_file, start, size, + prot, flags, pgoff, &populate); ++ if (!IS_ERR_VALUE(ret) && file && prfile) { ++ struct vm_area_struct *new_vma; ++ ++ new_vma = find_vma(mm, ret); ++ if (!new_vma->vm_prfile) ++ new_vma->vm_prfile = prfile; ++ if (new_vma != vma) ++ get_file(prfile); ++ } ++ /* ++ * two fput()s instead of vma_fput(vma), ++ * coz vma may not be available anymore. ++ */ + fput(file); ++ if (prfile) ++ fput(prfile); + out: + up_write(&mm->mmap_sem); + if (populate) +@@ -2864,7 +2881,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, + if (anon_vma_clone(new_vma, vma)) + goto out_free_mempol; + if (new_vma->vm_file) +- get_file(new_vma->vm_file); ++ vma_get_file(new_vma); + if (new_vma->vm_ops && new_vma->vm_ops->open) + new_vma->vm_ops->open(new_vma); + vma_link(mm, new_vma, prev, rb_link, rb_parent); +diff --git a/mm/nommu.c b/mm/nommu.c +index c8bd59a..4cfc2fc 100644 +--- a/mm/nommu.c ++++ b/mm/nommu.c +@@ -644,7 +644,7 @@ static void __put_nommu_region(struct vm_region *region) + up_write(&nommu_region_sem); + + if (region->vm_file) +- fput(region->vm_file); ++ vmr_fput(region); + + /* IO memory and memory shared directly out of the pagecache + * from ramfs/tmpfs mustn't be released here */ +@@ -802,7 +802,7 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma) + if (vma->vm_ops && vma->vm_ops->close) + vma->vm_ops->close(vma); + if (vma->vm_file) +- fput(vma->vm_file); ++ vma_fput(vma); + put_nommu_region(vma->vm_region); + kmem_cache_free(vm_area_cachep, vma); + } +@@ -1328,7 +1328,7 @@ unsigned long do_mmap(struct file *file, + goto error_just_free; + } + } +- fput(region->vm_file); ++ vmr_fput(region); + kmem_cache_free(vm_region_jar, region); + region = pregion; + result = start; +@@ -1403,10 +1403,10 @@ error_just_free: + up_write(&nommu_region_sem); + error: + if (region->vm_file) +- fput(region->vm_file); ++ vmr_fput(region); + kmem_cache_free(vm_region_jar, region); + if (vma->vm_file) +- fput(vma->vm_file); ++ vma_fput(vma); + kmem_cache_free(vm_area_cachep, vma); + return ret; + +diff --git a/mm/prfile.c b/mm/prfile.c +new file mode 100644 +index 0000000..b323b8a +--- /dev/null ++++ b/mm/prfile.c +@@ -0,0 +1,86 @@ ++/* ++ * Mainly for aufs which mmap(2) diffrent file and wants to print different path ++ * in /proc/PID/maps. ++ * Call these functions via macros defined in linux/mm.h. ++ * ++ * See Documentation/filesystems/aufs/design/06mmap.txt ++ * ++ * Copyright (c) 2014 Junjro R. Okajima ++ * Copyright (c) 2014 Ian Campbell ++ */ ++ ++#include ++#include ++#include ++ ++/* #define PRFILE_TRACE */ ++static inline void prfile_trace(struct file *f, struct file *pr, ++ const char func[], int line, const char func2[]) ++{ ++#ifdef PRFILE_TRACE ++ if (pr) ++ pr_info("%s:%d: %s, %s\n", func, line, func2, ++ f ? (char *)f->f_path.dentry->d_name.name : "(null)"); ++#endif ++} ++ ++void vma_do_file_update_time(struct vm_area_struct *vma, const char func[], ++ int line) ++{ ++ struct file *f = vma->vm_file, *pr = vma->vm_prfile; ++ ++ prfile_trace(f, pr, func, line, __func__); ++ file_update_time(f); ++ if (f && pr) ++ file_update_time(pr); ++} ++ ++struct file *vma_do_pr_or_file(struct vm_area_struct *vma, const char func[], ++ int line) ++{ ++ struct file *f = vma->vm_file, *pr = vma->vm_prfile; ++ ++ prfile_trace(f, pr, func, line, __func__); ++ return (f && pr) ? pr : f; ++} ++ ++void vma_do_get_file(struct vm_area_struct *vma, const char func[], int line) ++{ ++ struct file *f = vma->vm_file, *pr = vma->vm_prfile; ++ ++ prfile_trace(f, pr, func, line, __func__); ++ get_file(f); ++ if (f && pr) ++ get_file(pr); ++} ++ ++void vma_do_fput(struct vm_area_struct *vma, const char func[], int line) ++{ ++ struct file *f = vma->vm_file, *pr = vma->vm_prfile; ++ ++ prfile_trace(f, pr, func, line, __func__); ++ fput(f); ++ if (f && pr) ++ fput(pr); ++} ++ ++#ifndef CONFIG_MMU ++struct file *vmr_do_pr_or_file(struct vm_region *region, const char func[], ++ int line) ++{ ++ struct file *f = region->vm_file, *pr = region->vm_prfile; ++ ++ prfile_trace(f, pr, func, line, __func__); ++ return (f && pr) ? pr : f; ++} ++ ++void vmr_do_fput(struct vm_region *region, const char func[], int line) ++{ ++ struct file *f = region->vm_file, *pr = region->vm_prfile; ++ ++ prfile_trace(f, pr, func, line, __func__); ++ fput(f); ++ if (f && pr) ++ fput(pr); ++} ++#endif /* !CONFIG_MMU */ +diff --git a/security/commoncap.c b/security/commoncap.c +index 48071ed..50a1a40 100644 +--- a/security/commoncap.c ++++ b/security/commoncap.c +@@ -1058,12 +1058,14 @@ int cap_mmap_addr(unsigned long addr) + } + return ret; + } ++EXPORT_SYMBOL_GPL(cap_mmap_addr); + + int cap_mmap_file(struct file *file, unsigned long reqprot, + unsigned long prot, unsigned long flags) + { + return 0; + } ++EXPORT_SYMBOL_GPL(cap_mmap_file); + + #ifdef CONFIG_SECURITY + +diff --git a/security/device_cgroup.c b/security/device_cgroup.c +index 03c1652..f88c84b 100644 +--- a/security/device_cgroup.c ++++ b/security/device_cgroup.c +@@ -7,6 +7,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -849,6 +850,7 @@ int __devcgroup_inode_permission(struct inode *inode, int mask) + return __devcgroup_check_permission(type, imajor(inode), iminor(inode), + access); + } ++EXPORT_SYMBOL_GPL(__devcgroup_inode_permission); + + int devcgroup_inode_mknod(int mode, dev_t dev) + { +diff --git a/security/security.c b/security/security.c +index 3644b03..593879b 100644 +--- a/security/security.c ++++ b/security/security.c +@@ -433,6 +433,7 @@ int security_path_rmdir(struct path *dir, struct dentry *dentry) + return 0; + return call_int_hook(path_rmdir, 0, dir, dentry); + } ++EXPORT_SYMBOL_GPL(security_path_rmdir); + + int security_path_unlink(struct path *dir, struct dentry *dentry) + { +@@ -449,6 +450,7 @@ int security_path_symlink(struct path *dir, struct dentry *dentry, + return 0; + return call_int_hook(path_symlink, 0, dir, dentry, old_name); + } ++EXPORT_SYMBOL_GPL(security_path_symlink); + + int security_path_link(struct dentry *old_dentry, struct path *new_dir, + struct dentry *new_dentry) +@@ -457,6 +459,7 @@ int security_path_link(struct dentry *old_dentry, struct path *new_dir, + return 0; + return call_int_hook(path_link, 0, old_dentry, new_dir, new_dentry); + } ++EXPORT_SYMBOL_GPL(security_path_link); + + int security_path_rename(struct path *old_dir, struct dentry *old_dentry, + struct path *new_dir, struct dentry *new_dentry, +@@ -484,6 +487,7 @@ int security_path_truncate(struct path *path) + return 0; + return call_int_hook(path_truncate, 0, path); + } ++EXPORT_SYMBOL_GPL(security_path_truncate); + + int security_path_chmod(struct path *path, umode_t mode) + { +@@ -491,6 +495,7 @@ int security_path_chmod(struct path *path, umode_t mode) + return 0; + return call_int_hook(path_chmod, 0, path, mode); + } ++EXPORT_SYMBOL_GPL(security_path_chmod); + + int security_path_chown(struct path *path, kuid_t uid, kgid_t gid) + { +@@ -498,6 +503,7 @@ int security_path_chown(struct path *path, kuid_t uid, kgid_t gid) + return 0; + return call_int_hook(path_chown, 0, path, uid, gid); + } ++EXPORT_SYMBOL_GPL(security_path_chown); + + int security_path_chroot(struct path *path) + { +@@ -583,6 +589,7 @@ int security_inode_readlink(struct dentry *dentry) + return 0; + return call_int_hook(inode_readlink, 0, dentry); + } ++EXPORT_SYMBOL_GPL(security_inode_readlink); + + int security_inode_follow_link(struct dentry *dentry, struct inode *inode, + bool rcu) +@@ -598,6 +605,7 @@ int security_inode_permission(struct inode *inode, int mask) + return 0; + return call_int_hook(inode_permission, 0, inode, mask); + } ++EXPORT_SYMBOL_GPL(security_inode_permission); + + int security_inode_setattr(struct dentry *dentry, struct iattr *attr) + { +@@ -736,6 +744,7 @@ int security_file_permission(struct file *file, int mask) + + return fsnotify_perm(file, mask); + } ++EXPORT_SYMBOL_GPL(security_file_permission); + + int security_file_alloc(struct file *file) + { +@@ -795,6 +804,7 @@ int security_mmap_file(struct file *file, unsigned long prot, + return ret; + return ima_file_mmap(file, prot); + } ++EXPORT_SYMBOL_GPL(security_mmap_file); + + int security_mmap_addr(unsigned long addr) + { diff --git a/hp-wmi-rfkill-fix.patch b/hp-wmi-rfkill-fix.patch new file mode 100644 index 0000000..2f3d09c --- /dev/null +++ b/hp-wmi-rfkill-fix.patch @@ -0,0 +1,31 @@ +This patch fixes the problem encountered on many HP laptops: in some cases, +WiFi becomes hard-blocked and cannot be unblocked since then. Seen that on +HP 6730b and others. + +This is https://bugzilla.kernel.org/show_bug.cgi?id=69131 + +diff --git a/drivers/platform/x86/hp-wmi.c b/drivers/platform/x86/hp-wmi.c +index 0669731..37000f0 100644 +--- a/drivers/platform/x86/hp-wmi.c ++++ b/drivers/platform/x86/hp-wmi.c +@@ -714,6 +714,11 @@ static int __init hp_wmi_rfkill_setup(struct platform_device *device) + if (err) + return err; + ++ err = hp_wmi_perform_query(HPWMI_WIRELESS_QUERY, 1, &wireless, ++ sizeof(wireless), 0); ++ if (err) ++ return err; ++ + if (wireless & 0x1) { + wifi_rfkill = rfkill_alloc("hp-wifi", &device->dev, + RFKILL_TYPE_WLAN, +@@ -901,7 +906,7 @@ static int __init hp_wmi_bios_setup(struct platform_device *device) + wwan_rfkill = NULL; + rfkill2_count = 0; + +- if (hp_wmi_bios_2009_later() || hp_wmi_rfkill_setup(device)) ++ if (hp_wmi_rfkill_setup(device)) + hp_wmi_rfkill2_setup(device); + + err = device_create_file(&device->dev, &dev_attr_display); diff --git a/include-kbuild-export-pci_ids.patch b/include-kbuild-export-pci_ids.patch new file mode 100644 index 0000000..ebcf983 --- /dev/null +++ b/include-kbuild-export-pci_ids.patch @@ -0,0 +1,19 @@ +From Thierry Vignaud (Mandriva) + +We now lacks /usr/include/linux/pci_ids.h which break ldetect build... +Can you readd it please? +Thanks +--- + include/linux/Kbuild | 1 + + 1 file changed, 1 insertion(+) + +--- linux/include/uapi/linux/Kbuild.include-kbuild-export-pci_ids.orig ++++ linux/include/uapi/linux/Kbuild +@@ -277,6 +277,7 @@ header-y += param.h + header-y += parport.h + header-y += patchkey.h + header-y += pci.h ++header-y += pci_ids.h + header-y += pci_regs.h + header-y += perf_event.h + header-y += personality.h diff --git a/inotify-increase-max-user-watches.patch b/inotify-increase-max-user-watches.patch new file mode 100644 index 0000000..18c7aaa --- /dev/null +++ b/inotify-increase-max-user-watches.patch @@ -0,0 +1,20 @@ +http://bugs.rosalinux.ru/show_bug.cgi?id=4791 + +It seems, many applications need max_user_watches to be at least 32768. +Let us set the default value of this parameter accordingly. + +Signed-off-by: Eugene A. Shatokhin + +diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c +index 78a2ca3..ecd1120 100644 +--- a/fs/notify/inotify/inotify_user.c ++++ b/fs/notify/inotify/inotify_user.c +@@ -809,7 +809,7 @@ static int __init inotify_user_setup(void) + + inotify_max_queued_events = 16384; + inotify_max_user_instances = 128; +- inotify_max_user_watches = 8192; ++ inotify_max_user_watches = 32768; + + return 0; + } diff --git a/kernel.spec b/kernel.spec index 2089a99..b9b81a1 100644 --- a/kernel.spec +++ b/kernel.spec @@ -22,11 +22,6 @@ # version defines %define kversion %{kernelversion}.%{patchlevel}.%{sublevel} %define kverrel %{kversion}-%{rpmrel} - -# The revision of the patchset and the configs to use. Usually, the name of -# the branch in https://abf.rosalinux.ru/soft/kernel-patches-and-configs/ -# git repository. -%define revision v%{kernelversion}.%{patchlevel}.x %define tar_ver %{kernelversion}.%{patchlevel} %ifarch %{ix86} @@ -153,18 +148,59 @@ Source52: cpupower-start.sh Source53: cpupower.path Source80: kernel.rpmlintrc +#################################################################### -# Our patch tarball, -# see https://abf.rosalinux.ru/soft/kernel-patches-and-configs/ -Source100: https://abf.io/soft/kernel-patches-and-configs/archive/kernel-patches-and-configs-%{revision}.tar.gz -# #################################################################### -# # Patches # The patch to make kernel x.y.z from x.y.0. Patch1: ftp://ftp.kernel.org/pub/linux/kernel/v%{kernelversion}.x/patch-%{kversion}.xz Source10: ftp://ftp.kernel.org/pub/linux/kernel/v%{kernelversion}.x/patch-%{kversion}.sign +# ROSA-specific patches + +# Adds explicit linking of the Perf Python extension with libdl, thus fixing +# the build. +Patch100: perf-python-ext-link-with-dl.patch + +# Perf docs are built after all the kernels. To validate the xml files +# generated during that process, xmlto tries to get DTD files from the Net. +# If it fails, the whole build fails, which is unfortunate. Let us avoid +# this. +Patch101: perf-xmlto-skip-validation.patch + +# Bug http://bugs.rosalinux.ru/show_bug.cgi?id=4791 +Patch102: inotify-increase-max-user-watches.patch + +# Export pci_ids.h to user space, needed by ldetect +# TODO: is it really needed now? +Patch103: include-kbuild-export-pci_ids.patch + +# http://bugs.rosalinux.ru/show_bug.cgi?id=6235 +# http://bugs.rosalinux.ru/show_bug.cgi?id=6459 +Patch104: audit-make-it-less-verbose.patch + +# http://bugs.rosalinux.ru/show_bug.cgi?id=5649#c6 +Patch105: drm-cirrus-Use-16bpp-as-default.patch + +# Increase vmalloc area, https://bugs.mageia.org/show_bug.cgi?id=904 +Patch106: x86-increase-default-minimum-vmalloc-area-by-64MB-to-192MB.patch + +# disable floppy autoloading (mga #4696) +Patch107: block-floppy-disable-pnp-modalias.patch + +# prefer ata over ide drivers +Patch108: ata-prefer-ata-drivers-over-ide-drivers-when-both-are-built.patch + +# AUFS from http://aufs.sourceforge.net/ +Patch109: fs-aufs4.patch + +# https://bugzilla.kernel.org/show_bug.cgi?id=69131 +Patch110: hp-wmi-rfkill-fix.patch + +# BFQ IO scheduler, http://algogroup.unimore.it/people/paolo/disk_sched/ +Patch111: 0001-block-cgroups-kconfig-build-bits-for-BFQ-v7r11-4.5.0.patch +Patch112: 0002-block-introduce-the-BFQ-v7r11-I-O-sched-for-4.5.0.patch +Patch113: 0003-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r11-for.patch #################################################################### # Defines for the things that are needed for all the kernels @@ -540,16 +576,9 @@ should use the 'kernel-devel' package instead. # %prep %setup -q -n %top_dir_name -c -%setup -q -n %top_dir_name -D -T -a100 - -%define patches_dir ../kernel-patches-and-configs-%{revision}/ - cd %src_dir -%patch1 -p1 - -%{patches_dir}/scripts/apply_patches -%{patches_dir}/scripts/apply_patches-NRJ +%apply_patches # # Setup Begin diff --git a/perf-python-ext-link-with-dl.patch b/perf-python-ext-link-with-dl.patch new file mode 100644 index 0000000..3d46566 --- /dev/null +++ b/perf-python-ext-link-with-dl.patch @@ -0,0 +1,22 @@ +diff --git a/tools/perf/util/setup.py b/tools/perf/util/setup.py +index c868098..61c98a9 100644 +--- a/tools/perf/util/setup.py ++++ b/tools/perf/util/setup.py +@@ -22,6 +22,9 @@ cflags = getenv('CFLAGS', '').split() + # switch off several checks (need to be at the end of cflags list) + cflags += ['-fno-strict-aliasing', '-Wno-write-strings', '-Wno-unused-parameter' ] + ++libs = getenv('LIBS', '').split() ++libs += ['dl'] ++ + src_perf = getenv('srctree') + '/tools/perf' + build_lib = getenv('PYTHON_EXTBUILD_LIB') + build_tmp = getenv('PYTHON_EXTBUILD_TMP') +@@ -39,6 +42,7 @@ perf = Extension('perf', + include_dirs = ['util/include'], + extra_compile_args = cflags, + extra_objects = [libtraceevent, libapikfs], ++ libraries = libs, + ) + + setup(name='perf', diff --git a/perf-xmlto-skip-validation.patch b/perf-xmlto-skip-validation.patch new file mode 100644 index 0000000..faa6a0e --- /dev/null +++ b/perf-xmlto-skip-validation.patch @@ -0,0 +1,13 @@ +diff --git a/tools/perf/Documentation/Makefile b/tools/perf/Documentation/Makefile +index 3ba1c0b..2f5070f 100644 +--- a/tools/perf/Documentation/Makefile ++++ b/tools/perf/Documentation/Makefile +@@ -49,7 +49,7 @@ man7dir=$(mandir)/man7 + ASCIIDOC=asciidoc + ASCIIDOC_EXTRA = --unsafe + MANPAGE_XSL = manpage-normal.xsl +-XMLTO_EXTRA = ++XMLTO_EXTRA = --skip-validation + INSTALL?=install + RM ?= rm -f + DOC_REF = origin/man diff --git a/x86-increase-default-minimum-vmalloc-area-by-64MB-to-192MB.patch b/x86-increase-default-minimum-vmalloc-area-by-64MB-to-192MB.patch new file mode 100644 index 0000000..05d57ee --- /dev/null +++ b/x86-increase-default-minimum-vmalloc-area-by-64MB-to-192MB.patch @@ -0,0 +1,28 @@ +From 2db91ae7c0862bc83c1fe8b8bd14b8de3e2c02b8 Mon Sep 17 00:00:00 2001 +From: Anssi Hannula +Date: Sat, 30 Apr 2011 17:09:04 +0300 +Subject: [PATCH] x86: increase default minimum vmalloc area by 64MB to 192MB + +This fixes issues like https://bugs.mageia.org/show_bug.cgi?id=904. + +Signed-off-by: Anssi Hannula +--- + arch/x86/mm/pgtable_32.c | 2 +- + 1 files changed, 1 insertions(+), 1 deletions(-) + +diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c +index cac7184..5801fb7 100644 +--- a/arch/x86/mm/pgtable_32.c ++++ b/arch/x86/mm/pgtable_32.c +@@ -19,7 +19,7 @@ + #include + #include + +-unsigned int __VMALLOC_RESERVE = 128 << 20; ++unsigned int __VMALLOC_RESERVE = 192 << 20; + + /* + * Associate a virtual page frame with a given physical page frame +-- +1.7.3 +