/*
 * linux/block/blk-cgroup.h -- 456 lines, 14 KiB, C
 * (web view header: Raw / Normal View / History)
 */

#ifndef _BLK_CGROUP_H
#define _BLK_CGROUP_H
/*
* Common Block IO controller cgroup interface
*
* Based on ideas and code from CFQ, CFS and BFQ:
* Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
*
* Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
* Paolo Valente <paolo.valente@unimore.it>
*
* Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
* Nauman Rafique <nauman@google.com>
*/
#include <linux/cgroup.h>
#include <linux/u64_stats_sync.h>
/*
 * Identifiers of the block IO controller policies.  The values count up
 * from zero, so the last enumerator equals the number of policies.
 */
enum blkio_policy_id {
	/* Proportional Bandwidth division */
	BLKIO_POLICY_PROP,
	/* Throttling */
	BLKIO_POLICY_THROTL,

	/* number of policies, keep last */
	BLKIO_NR_POLICIES
};
/*
 * Max limits for throttle policy.
 * THROTL_IOPS_MAX expands to UINT_MAX; the header providing UINT_MAX is not
 * included here, so it has to come from the including file -- TODO confirm.
 */
#define THROTL_IOPS_MAX UINT_MAX
#ifdef CONFIG_BLK_CGROUP
/* History annotation: 2010-04-09 10:31:19 +04:00 */
/*
 * Statistics kept for a blkio group.  The enumerators up to and including
 * BLKIO_STAT_QUEUED are kept per sub type (BLKIO_STAT_ARR_NR is derived from
 * BLKIO_STAT_QUEUED); BLKIO_STAT_TIME and everything after it is single
 * valued.  The debug-only entries exist only with CONFIG_DEBUG_BLK_CGROUP.
 */
enum stat_type {
	/* Number of IOs merged */
	BLKIO_STAT_MERGED,
	/*
	 * Total time spent (in ns) between request dispatch to the driver and
	 * request completion for IOs done by this cgroup. This may not be
	 * accurate when NCQ is turned on.
	 */
	BLKIO_STAT_SERVICE_TIME,
	/* Total time spent waiting in scheduler queue in ns */
	BLKIO_STAT_WAIT_TIME,
	/* Number of IOs queued up */
	BLKIO_STAT_QUEUED,

	/* All the single valued stats go below this */
	BLKIO_STAT_TIME,
#ifdef CONFIG_DEBUG_BLK_CGROUP
	/* Time not charged to this cgroup */
	BLKIO_STAT_UNACCOUNTED_TIME,
	BLKIO_STAT_AVG_QUEUE_SIZE,
	BLKIO_STAT_IDLE_TIME,
	BLKIO_STAT_EMPTY_TIME,
	BLKIO_STAT_GROUP_WAIT_TIME,
	BLKIO_STAT_DEQUEUE
#endif
};
/*
 * Types lower than this live in stat_arr and have subtypes.
 * BLKIO_STAT_QUEUED is the last such type, hence the "+ 1" to get a count.
 */
#define BLKIO_STAT_ARR_NR (BLKIO_STAT_QUEUED + 1)
/*
 * Per cpu stats.  BLKIO_STAT_CPU_NR comes last and is therefore the number
 * of per cpu stat types; it sizes the first dimension of stat_arr_cpu.
 */
enum stat_type_cpu {
	BLKIO_STAT_CPU_SECTORS = 0,
	BLKIO_STAT_CPU_SERVICE_BYTES,	/* Total bytes transferred */
	BLKIO_STAT_CPU_SERVICED,	/* Total IOs serviced, post merge */

	BLKIO_STAT_CPU_NR,		/* keep last */
};
/* History annotation: 2010-04-09 10:31:19 +04:00 */
/*
 * Sub types a statistic is split into.  BLKIO_STAT_TOTAL comes last and is
 * used as the size of the sub type dimension of the stat arrays.
 */
enum stat_sub_type {
	BLKIO_STAT_READ,
	BLKIO_STAT_WRITE,
	BLKIO_STAT_SYNC,
	BLKIO_STAT_ASYNC,

	BLKIO_STAT_TOTAL,
};
/*
 * History annotation (2010-04-09 08:15:35 +04:00):
 *
 * blkio: Add more debug-only per-cgroup stats
 *
 * 1) group_wait_time - This is the amount of time the cgroup had to wait to
 *    get a timeslice for one of its queues from when it became busy, i.e.,
 *    went from 0 to 1 request queued. This is different from the
 *    io_wait_time which is the cumulative total of the amount of time spent
 *    by each IO in that cgroup waiting in the scheduler queue. This stat is
 *    a great way to find out any jobs in the fleet that are being starved or
 *    waiting for longer than what is expected (due to an IO controller bug
 *    or any other issue).
 * 2) empty_time - This is the amount of time a cgroup spends w/o any pending
 *    requests. This stat is useful when a job does not seem to be able to
 *    use its assigned disk share by helping check if that is happening due
 *    to an IO controller bug or because the job is not submitting enough
 *    IOs.
 * 3) idle_time - This is the amount of time spent by the IO scheduler idling
 *    for a given cgroup in anticipation of a better request than the
 *    existing ones from other queues/cgroups.
 *
 * All these stats are recorded using start and stop events. When reading
 * these stats, we do not add the delta between the current time and the last
 * start time if we're between the start and stop events. We avoid doing this
 * to make sure that these numbers are always monotonically increasing when
 * read. Since we're using sched_clock() which may use the tsc as its source,
 * it may induce some inconsistency (due to tsc resync across cpus) if we
 * included the current delta.
 *
 * Signed-off-by: Divyesh Shah <dpshah@google.com>
 * Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
 */
/*
 * blkg state flags.  Each enumerator is a bit number, not a mask: the
 * BLKG_FLAG_FNS() helpers shift 1 by it to get at the bit in the stats flags.
 */
enum blkg_state_flags {
	BLKG_waiting,
	BLKG_idling,
	BLKG_empty
};
/*
 * cgroup files owned by proportional weight policy.
 * Numbering starts at 1, not 0.
 */
enum blkcg_file_name_prop {
	BLKIO_PROP_weight = 1,
	BLKIO_PROP_weight_device,

	BLKIO_PROP_io_service_bytes,
	BLKIO_PROP_io_serviced,
	BLKIO_PROP_time,
	BLKIO_PROP_sectors,
	BLKIO_PROP_unaccounted_time,

	BLKIO_PROP_io_service_time,
	BLKIO_PROP_io_wait_time,
	BLKIO_PROP_io_merged,
	BLKIO_PROP_io_queued,

	BLKIO_PROP_avg_queue_size,
	BLKIO_PROP_group_wait_time,
	BLKIO_PROP_idle_time,
	BLKIO_PROP_empty_time,
	BLKIO_PROP_dequeue
};
/*
 * cgroup files owned by throttle policy.
 * Numbering starts at 0.
 */
enum blkcg_file_name_throtl {
	BLKIO_THROTL_read_bps_device = 0,
	BLKIO_THROTL_write_bps_device,
	BLKIO_THROTL_read_iops_device,
	BLKIO_THROTL_write_iops_device,

	BLKIO_THROTL_io_service_bytes,
	BLKIO_THROTL_io_serviced
};
/*
 * Block IO controller state of one cgroup.
 */
struct blkio_cgroup {
/* cgroup subsystem state embedded in this structure */
struct cgroup_subsys_state css;
/* weight of this cgroup; presumably within BLKIO_WEIGHT_MIN..MAX -- TODO confirm */
unsigned int weight;
/* NOTE(review): presumably protects blkg_list -- confirm against blk-cgroup.c */
spinlock_t lock;
/* hlist head; presumably the blkio_groups linked via blkio_group::blkcg_node -- confirm */
struct hlist_head blkg_list;
};
/*
 * Statistics of a blkio group.  The debug-only members exist only with
 * CONFIG_DEBUG_BLK_CGROUP.  Member order matters: the
 * BLKG_STATS_DEBUG_CLEAR_* macros describe the byte range from
 * unaccounted_time up to (not including) start_group_wait_time.
 */
struct blkio_group_stats {
	struct u64_stats_sync syncp;
	/* total disk time of this group (sectors live in the per cpu stats) */
	uint64_t time;
	/* one counter per (stat type below BLKIO_STAT_ARR_NR, sub type) */
	uint64_t stat_arr[BLKIO_STAT_ARR_NR][BLKIO_STAT_TOTAL];
#ifdef CONFIG_DEBUG_BLK_CGROUP
	/* Time not charged to this cgroup */
	uint64_t unaccounted_time;

	/* Sum of number of IOs queued across all samples */
	uint64_t avg_queue_size_sum;
	/* Count of samples taken for average */
	uint64_t avg_queue_size_samples;
	/* How many times this group has been removed from service tree */
	unsigned long dequeue;

	/* Total time spent waiting for it to be assigned a timeslice. */
	uint64_t group_wait_time;

	/* Time spent idling for this blkio_group */
	uint64_t idle_time;
	/*
	 * Total time when we have requests queued and do not contain the
	 * current active queue.
	 * NOTE(review): the change that introduced this stat describes
	 * empty_time as time spent without any pending requests -- confirm
	 * which description is right.
	 */
	uint64_t empty_time;

	/* fields after this shouldn't be cleared on stat reset */
	uint64_t start_group_wait_time;
	uint64_t start_idle_time;
	uint64_t start_empty_time;
	/* bit mask of (1 << BLKG_*) state flags, see BLKG_FLAG_FNS() */
	uint16_t flags;
#endif
};
#ifdef CONFIG_DEBUG_BLK_CGROUP
/*
 * Byte range of the debug stats inside struct blkio_group_stats:
 * CLEAR_START is the offset of unaccounted_time, CLEAR_SIZE the distance
 * from there to start_group_wait_time (exclusive).  Going by the names and
 * the "shouldn't be cleared on stat reset" marker in the struct, this is
 * presumably the region zeroed on a stat reset -- confirm in blk-cgroup.c.
 */
#define BLKG_STATS_DEBUG_CLEAR_START \
offsetof(struct blkio_group_stats, unaccounted_time)
#define BLKG_STATS_DEBUG_CLEAR_SIZE \
(offsetof(struct blkio_group_stats, start_group_wait_time) - \
BLKG_STATS_DEBUG_CLEAR_START)
#endif
/* Per cpu blkio group stats */
struct blkio_group_stats_cpu {
/* sector count; presumably sectors transferred by the group -- TODO confirm */
uint64_t sectors;
/* counters sized by BLKIO_STAT_CPU_NR (stat type) x BLKIO_STAT_TOTAL (sub type) */
uint64_t stat_arr_cpu[BLKIO_STAT_CPU_NR][BLKIO_STAT_TOTAL];
/* u64_stats sync state; presumably covers the counters above -- confirm */
struct u64_stats_sync syncp;
};
/*
 * History annotation (2012-03-06 01:15:07 +04:00):
 *
 * blkcg: don't allow or retain configuration of missing devices
 *
 * blkcg is very peculiar in that it allows setting and remembering
 * configurations for non-existent devices by maintaining separate data
 * structures for configuration. This behavior is completely out of the usual
 * norms and outright confusing; furthermore, it uses dev_t number to match
 * the configuration to devices, which is unpredictable to begin with and
 * becomes completely unusable if EXT_DEVT is fully used.
 *
 * It is wholly unnecessary - we already have fully functional userland
 * mechanism to program devices being hotplugged which has full access to
 * device identification, connection topology and filesystem information.
 *
 * Add a new struct blkio_group_conf which contains all blkcg configurations
 * to blkio_group and let blkio_group, which can be created iff the
 * associated device exists and is removed when the associated device goes
 * away, carry all configurations.
 *
 * Note that, after this patch, all newly created blkg's will always have the
 * default configuration (unlimited for throttling and blkcg's weight for
 * propio).
 *
 * This patch makes blkio_policy_node meaningless but doesn't remove it. The
 * next patch will.
 *
 * -v2: Updated to retry after short sleep if blkg lookup/creation failed due
 *      to the queue being temporarily bypassed as indicated by -EBUSY
 *      return. Pointed out by Vivek.
 *
 * Signed-off-by: Tejun Heo <tj@kernel.org>
 * Cc: Vivek Goyal <vgoyal@redhat.com>
 * Cc: Kay Sievers <kay.sievers@vrfy.org>
 * Signed-off-by: Jens Axboe <axboe@kernel.dk>
 */
/*
 * Configuration values carried by a blkio group (embedded in
 * struct blkg_policy_data).
 */
struct blkio_group_conf {
/* weight setting */
unsigned int weight;
/* IO-per-second settings; presumably one per direction (read, write) -- TODO confirm index order */
unsigned int iops[2];
/* bytes-per-second settings; presumably one per direction (read, write) -- TODO confirm index order */
u64 bps[2];
};
/*
 * per-blkg per-policy data
 *
 * The policy's private area (pdata) is the trailing flexible array member;
 * pdata_to_blkg() uses container_of() on it to get back to this structure.
 */
struct blkg_policy_data {
/* the blkg this per-policy data belongs to */
struct blkio_group *blkg;
/* Configuration */
struct blkio_group_conf conf;
/* Statistics of this blkg-policy pair */
struct blkio_group_stats stats;
/* Per cpu stats pointer */
struct blkio_group_stats_cpu __percpu *stats_cpu;
/* pol->pdata_size bytes of private data used by policy impl */
/* aligned like unsigned long long so the policy can place any of its data there */
char pdata[] __aligned(__alignof__(unsigned long long));
};
struct blkio_group {
/* Pointer to the associated request_queue */
struct request_queue *q;
blkcg: unify blkg's for blkcg policies Currently, blkg is per cgroup-queue-policy combination. This is unnatural and leads to various convolutions in partially used duplicate fields in blkg, config / stat access, and general management of blkgs. This patch make blkg's per cgroup-queue and let them serve all policies. blkgs are now created and destroyed by blkcg core proper. This will allow further consolidation of common management logic into blkcg core and API with better defined semantics and layering. As a transitional step to untangle blkg management, elvswitch and policy [de]registration, all blkgs except the root blkg are being shot down during elvswitch and bypass. This patch adds blkg_root_update() to update root blkg in place on policy change. This is hacky and racy but should be good enough as interim step until we get locking simplified and switch over to proper in-place update for all blkgs. -v2: Root blkgs need to be updated on elvswitch too and blkg_alloc() comment wasn't updated according to the function change. Fixed. Both pointed out by Vivek. -v3: v2 updated blkg_destroy_all() to invoke update_root_blkg_pd() for all policies. This freed root pd during elvswitch before the last queue finished exiting and led to oops. Directly invoke update_root_blkg_pd() only on BLKIO_POLICY_PROP from cfq_exit_queue(). This also is closer to what will be done with proper in-place blkg update. Reported by Vivek. Signed-off-by: Tejun Heo <tj@kernel.org> Cc: Vivek Goyal <vgoyal@redhat.com> Signed-off-by: Jens Axboe <axboe@kernel.dk>
2012-03-06 01:15:20 +04:00
struct list_head q_node;
struct hlist_node blkcg_node;
struct blkio_cgroup *blkcg;
/* Store cgroup path */
char path[128];
/* reference count */
int refcnt;
struct blkg_policy_data *pd[BLKIO_NR_POLICIES];
blkcg: alloc per cpu stats from worker thread in a delayed manner Current per cpu stat allocation assumes GFP_KERNEL allocation flag. But in IO path there are times when we want GFP_NOIO semantics. As there is no way to pass the allocation flags to alloc_percpu(), this patch delays the allocation of stats using a worker thread. v2-> tejun suggested following changes. Changed the patch accordingly. - move alloc_node location in structure - reduce the size of names of some of the fields - Reduce the scope of locking of alloc_list_lock - Simplified stat_alloc_fn() by allocating stats for all policies in one go and then assigning these to a group. v3 -> Andrew suggested to put some comments in the code. Also raised concerns about trying to allocate infinitely in case of allocation failure. I have changed the logic to sleep for 10ms before retrying. That should take care of non-preemptible UP kernels. v4 -> Tejun had more suggestions. - drop list_for_each_entry_all() - instead of msleep() use queue_delayed_work() - Some cleanups realted to more compact coding. v5-> tejun suggested more cleanups leading to more compact code. tj: - Relocated pcpu_stats into blkio_stat_alloc_fn(). - Minor comment update. - This also fixes suspicious RCU usage warning caused by invoking cgroup_path() from blkg_alloc() without holding RCU read lock. Now that blkg_alloc() doesn't require sleepable context, RCU read lock from blkg_lookup_create() is maintained throughout blkg_alloc(). Signed-off-by: Vivek Goyal <vgoyal@redhat.com> Signed-off-by: Tejun Heo <tj@kernel.org> Signed-off-by: Jens Axboe <axboe@kernel.dk>
2012-03-08 22:53:56 +04:00
/* List of blkg waiting for per cpu stats memory to be allocated */
struct list_head alloc_node;
struct rcu_head rcu_head;
};
/*
 * Function types of the callbacks a blkio policy supplies through
 * struct blkio_policy_ops.  All of them return nothing; the update
 * callbacks take the request queue, the group and a value for the setting
 * named in the type.
 */
typedef void blkio_init_group_fn(struct blkio_group *blkg);

typedef void blkio_update_group_weight_fn(struct request_queue *q,
					  struct blkio_group *blkg,
					  unsigned int weight);

typedef void blkio_update_group_read_bps_fn(struct request_queue *q,
					    struct blkio_group *blkg,
					    u64 read_bps);
typedef void blkio_update_group_write_bps_fn(struct request_queue *q,
					     struct blkio_group *blkg,
					     u64 write_bps);

typedef void blkio_update_group_read_iops_fn(struct request_queue *q,
					     struct blkio_group *blkg,
					     unsigned int read_iops);
typedef void blkio_update_group_write_iops_fn(struct request_queue *q,
					      struct blkio_group *blkg,
					      unsigned int write_iops);
/*
 * Callback table of a blkio policy.  Every member is a pointer to the
 * callback type of the same name.
 */
struct blkio_policy_ops {
/* group initialization callback */
blkio_init_group_fn *blkio_init_group_fn;
/* callbacks for changed weight / bps / iops settings */
blkio_update_group_weight_fn *blkio_update_group_weight_fn;
blkio_update_group_read_bps_fn *blkio_update_group_read_bps_fn;
blkio_update_group_write_bps_fn *blkio_update_group_write_bps_fn;
blkio_update_group_read_iops_fn *blkio_update_group_read_iops_fn;
blkio_update_group_write_iops_fn *blkio_update_group_write_iops_fn;
};
/*
 * Description of a blkio policy, as handed to blkio_policy_register() and
 * blkio_policy_unregister().
 */
struct blkio_policy_type {
/* list node; presumably links the registered policies -- confirm */
struct list_head list;
/* callbacks of this policy */
struct blkio_policy_ops ops;
/* policy id; used as index into blkio_group::pd by blkg_to_pdata() */
enum blkio_policy_id plid;
size_t pdata_size; /* policy specific private data size */
};
/* Per request_queue hooks: init returns an int, drain and exit return nothing */
int blkcg_init_queue(struct request_queue *q);
void blkcg_drain_queue(struct request_queue *q);
void blkcg_exit_queue(struct request_queue *q);

/* Blkio controller policy registration */
void blkio_policy_register(struct blkio_policy_type *blkiop);
void blkio_policy_unregister(struct blkio_policy_type *blkiop);
/*
 * History annotation (2012-03-06 01:15:20 +04:00):
 *
 * blkcg: unify blkg's for blkcg policies
 *
 * Currently, blkg is per cgroup-queue-policy combination. This is unnatural
 * and leads to various convolutions in partially used duplicate fields in
 * blkg, config / stat access, and general management of blkgs.
 *
 * This patch makes blkg's per cgroup-queue and lets them serve all policies.
 * blkgs are now created and destroyed by blkcg core proper. This will allow
 * further consolidation of common management logic into blkcg core and API
 * with better defined semantics and layering.
 *
 * As a transitional step to untangle blkg management, elvswitch and policy
 * [de]registration, all blkgs except the root blkg are being shot down
 * during elvswitch and bypass. This patch adds blkg_root_update() to update
 * root blkg in place on policy change. This is hacky and racy but should be
 * good enough as interim step until we get locking simplified and switch
 * over to proper in-place update for all blkgs.
 *
 * -v2: Root blkgs need to be updated on elvswitch too and blkg_alloc()
 *      comment wasn't updated according to the function change. Fixed. Both
 *      pointed out by Vivek.
 *
 * -v3: v2 updated blkg_destroy_all() to invoke update_root_blkg_pd() for all
 *      policies. This freed root pd during elvswitch before the last queue
 *      finished exiting and led to oops. Directly invoke
 *      update_root_blkg_pd() only on BLKIO_POLICY_PROP from
 *      cfq_exit_queue(). This also is closer to what will be done with
 *      proper in-place blkg update. Reported by Vivek.
 *
 * Signed-off-by: Tejun Heo <tj@kernel.org>
 * Cc: Vivek Goyal <vgoyal@redhat.com>
 * Signed-off-by: Jens Axboe <axboe@kernel.dk>
 */
/*
 * blkg teardown / root blkg update for a request_queue; both are defined
 * outside this header.
 */
void blkg_destroy_all(struct request_queue *q, bool destroy_root);
void update_root_blkg_pd(struct request_queue *q, enum blkio_policy_id plid);
/**
* blkg_to_pdata - get policy private data
* @blkg: blkg of interest
* @pol: policy of interest
*
* Return pointer to private data associated with the @blkg-@pol pair.
*/
static inline void *blkg_to_pdata(struct blkio_group *blkg,
struct blkio_policy_type *pol)
{
return blkg ? blkg->pd[pol->plid]->pdata : NULL;
}
/**
 * pdata_to_blkg - get blkg associated with policy private data
 * @pdata: policy private data of interest, may be NULL
 * @pol: policy @pdata is for (not used by the body)
 *
 * @pdata is the private data area embedded in a blkg_policy_data.  Step
 * back to the containing structure and return the blkg recorded there;
 * a NULL @pdata yields NULL.
 */
static inline struct blkio_group *pdata_to_blkg(void *pdata,
						struct blkio_policy_type *pol)
{
	struct blkg_policy_data *pd;

	if (!pdata)
		return NULL;

	pd = container_of(pdata, struct blkg_policy_data, pdata);
	return pd->blkg;
}
/**
 * blkg_path - get the cgroup path stored in a blkg
 * @blkg: blkg of interest, must not be NULL
 *
 * Returns a pointer to the path buffer embedded in @blkg.
 */
static inline char *blkg_path(struct blkio_group *blkg)
{
	return &blkg->path[0];
}
/**
 * blkg_get - get a blkg reference
 * @blkg: blkg to get
 *
 * The caller should be holding queue_lock and an existing reference.
 *
 * Both preconditions are checked: lockdep asserts that the queue lock is
 * held, and a one-time warning fires when the count shows no existing
 * reference.  The count is a signed int, so "no reference" is tested as
 * <= 0, the same test blkg_put() uses, rather than only == 0.
 */
static inline void blkg_get(struct blkio_group *blkg)
{
	lockdep_assert_held(blkg->q->queue_lock);
	WARN_ON_ONCE(blkg->refcnt <= 0);
	blkg->refcnt++;
}
/* Invoked by blkg_put() once the reference count has dropped to zero */
void __blkg_release(struct blkio_group *blkg);

/**
 * blkg_put - put a blkg reference
 * @blkg: blkg to put
 *
 * The caller should be holding queue_lock.
 *
 * Warns once if the count is already zero or negative, then drops one
 * reference and hands @blkg to __blkg_release() when none are left.
 */
static inline void blkg_put(struct blkio_group *blkg)
{
	lockdep_assert_held(blkg->q->queue_lock);
	WARN_ON_ONCE(blkg->refcnt <= 0);

	blkg->refcnt--;
	if (blkg->refcnt == 0)
		__blkg_release(blkg);
}
#else
/*
 * Empty stand-ins used when CONFIG_BLK_CGROUP is not set; the stub
 * functions below take pointers to them.
 */
struct blkio_group {
};
struct blkio_policy_type {
};
/*
 * Stand-ins for builds without CONFIG_BLK_CGROUP: queue init reports
 * success (0), everything else does nothing.
 */
static inline int blkcg_init_queue(struct request_queue *q)
{
	return 0;
}

static inline void blkcg_drain_queue(struct request_queue *q)
{
}

static inline void blkcg_exit_queue(struct request_queue *q)
{
}

static inline void blkio_policy_register(struct blkio_policy_type *blkiop)
{
}

static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop)
{
}
/*
 * No-op stand-in for builds without CONFIG_BLK_CGROUP.  The second
 * parameter carries the same name as in the real prototype (destroy_root).
 */
static inline void blkg_destroy_all(struct request_queue *q,
				    bool destroy_root) { }
/*
 * History annotation (2012-03-06 01:15:20 +04:00):
 *
 * blkcg: unify blkg's for blkcg policies
 *
 * Currently, blkg is per cgroup-queue-policy combination. This is unnatural
 * and leads to various convolutions in partially used duplicate fields in
 * blkg, config / stat access, and general management of blkgs.
 *
 * This patch makes blkg's per cgroup-queue and lets them serve all policies.
 * blkgs are now created and destroyed by blkcg core proper. This will allow
 * further consolidation of common management logic into blkcg core and API
 * with better defined semantics and layering.
 *
 * As a transitional step to untangle blkg management, elvswitch and policy
 * [de]registration, all blkgs except the root blkg are being shot down
 * during elvswitch and bypass. This patch adds blkg_root_update() to update
 * root blkg in place on policy change. This is hacky and racy but should be
 * good enough as interim step until we get locking simplified and switch
 * over to proper in-place update for all blkgs.
 *
 * -v2: Root blkgs need to be updated on elvswitch too and blkg_alloc()
 *      comment wasn't updated according to the function change. Fixed. Both
 *      pointed out by Vivek.
 *
 * -v3: v2 updated blkg_destroy_all() to invoke update_root_blkg_pd() for all
 *      policies. This freed root pd during elvswitch before the last queue
 *      finished exiting and led to oops. Directly invoke
 *      update_root_blkg_pd() only on BLKIO_POLICY_PROP from
 *      cfq_exit_queue(). This also is closer to what will be done with
 *      proper in-place blkg update. Reported by Vivek.
 *
 * Signed-off-by: Tejun Heo <tj@kernel.org>
 * Cc: Vivek Goyal <vgoyal@redhat.com>
 * Signed-off-by: Jens Axboe <axboe@kernel.dk>
 */
/* No-op stand-in for builds without CONFIG_BLK_CGROUP */
static inline void update_root_blkg_pd(struct request_queue *q,
				       enum blkio_policy_id plid)
{
}
/*
 * Stand-ins for builds without CONFIG_BLK_CGROUP: the lookups and the path
 * accessor return NULL, taking and dropping a reference do nothing.
 */
static inline void *blkg_to_pdata(struct blkio_group *blkg,
				  struct blkio_policy_type *pol)
{
	return NULL;
}

static inline struct blkio_group *pdata_to_blkg(void *pdata,
						struct blkio_policy_type *pol)
{
	return NULL;
}

static inline char *blkg_path(struct blkio_group *blkg)
{
	return NULL;
}

static inline void blkg_get(struct blkio_group *blkg)
{
}

static inline void blkg_put(struct blkio_group *blkg)
{
}
#endif
/*
 * Lower bound, upper bound and default for a weight value.
 * NOTE(review): presumably applies to blkio_cgroup::weight and the per
 * group weight setting -- confirm against the users in blk-cgroup.c.
 */
#define BLKIO_WEIGHT_MIN 10
#define BLKIO_WEIGHT_MAX 1000
#define BLKIO_WEIGHT_DEFAULT 500
#ifdef CONFIG_DEBUG_BLK_CGROUP
/*
 * Debug statistics hooks, declared only with CONFIG_DEBUG_BLK_CGROUP.
 * Each takes the group and the policy whose stats are concerned.
 */
void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg,
					 struct blkio_policy_type *pol);

void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
				  struct blkio_policy_type *pol,
				  unsigned long dequeue);

void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg,
					struct blkio_policy_type *pol);

void blkiocg_update_idle_time_stats(struct blkio_group *blkg,
				    struct blkio_policy_type *pol);

void blkiocg_set_start_empty_time(struct blkio_group *blkg,
				  struct blkio_policy_type *pol);
/*
 * History annotation (2010-04-09 08:15:35 +04:00):
 *
 * blkio: Add more debug-only per-cgroup stats
 *
 * 1) group_wait_time - This is the amount of time the cgroup had to wait to
 *    get a timeslice for one of its queues from when it became busy, i.e.,
 *    went from 0 to 1 request queued. This is different from the
 *    io_wait_time which is the cumulative total of the amount of time spent
 *    by each IO in that cgroup waiting in the scheduler queue. This stat is
 *    a great way to find out any jobs in the fleet that are being starved or
 *    waiting for longer than what is expected (due to an IO controller bug
 *    or any other issue).
 * 2) empty_time - This is the amount of time a cgroup spends w/o any pending
 *    requests. This stat is useful when a job does not seem to be able to
 *    use its assigned disk share by helping check if that is happening due
 *    to an IO controller bug or because the job is not submitting enough
 *    IOs.
 * 3) idle_time - This is the amount of time spent by the IO scheduler idling
 *    for a given cgroup in anticipation of a better request than the
 *    existing ones from other queues/cgroups.
 *
 * All these stats are recorded using start and stop events. When reading
 * these stats, we do not add the delta between the current time and the last
 * start time if we're between the start and stop events. We avoid doing this
 * to make sure that these numbers are always monotonically increasing when
 * read. Since we're using sched_clock() which may use the tsc as its source,
 * it may induce some inconsistency (due to tsc resync across cpus) if we
 * included the current delta.
 *
 * Signed-off-by: Divyesh Shah <dpshah@google.com>
 * Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
 */
#define BLKG_FLAG_FNS(name) \
static inline void blkio_mark_blkg_##name( \
struct blkio_group_stats *stats) \
{ \
stats->flags |= (1 << BLKG_##name); \
} \
static inline void blkio_clear_blkg_##name( \
struct blkio_group_stats *stats) \
{ \
stats->flags &= ~(1 << BLKG_##name); \
} \
static inline int blkio_blkg_##name(struct blkio_group_stats *stats) \
{ \
return (stats->flags & (1 << BLKG_##name)) != 0; \
} \
BLKG_FLAG_FNS(waiting)
BLKG_FLAG_FNS(idling)
BLKG_FLAG_FNS(empty)
#undef BLKG_FLAG_FNS
#else
/*
 * No-op versions of the debug statistics hooks, used when
 * CONFIG_DEBUG_BLK_CGROUP is not set.
 */
static inline void
blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg,
				    struct blkio_policy_type *pol)
{
}

static inline void
blkiocg_update_dequeue_stats(struct blkio_group *blkg,
			     struct blkio_policy_type *pol,
			     unsigned long dequeue)
{
}

static inline void
blkiocg_update_set_idle_time_stats(struct blkio_group *blkg,
				   struct blkio_policy_type *pol)
{
}

static inline void
blkiocg_update_idle_time_stats(struct blkio_group *blkg,
			       struct blkio_policy_type *pol)
{
}

static inline void
blkiocg_set_start_empty_time(struct blkio_group *blkg,
			     struct blkio_policy_type *pol)
{
}
#endif
#ifdef CONFIG_BLK_CGROUP
/* The root blkio cgroup object, defined outside this header */
extern struct blkio_cgroup blkio_root_cgroup;

/* Map a cgroup, respectively a bio, to a blkio_cgroup */
struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup);
struct blkio_cgroup *bio_blkio_cgroup(struct bio *bio);
/*
 * History annotation (2012-03-06 01:15:06 +04:00):
 *
 * blkcg: factor out blkio_group creation
 *
 * Currently both blk-throttle and cfq-iosched implement their own
 * blkio_group creation code in throtl_get_tg() and cfq_get_cfqg(). This
 * patch factors out the common code into blkg_lookup_create(), which returns
 * ERR_PTR value so that transitional failures due to queue bypass can be
 * distinguished from other failures.
 *
 * * New blkio_policy_ops methods blkio_alloc_group_fn() and
 *   blkio_link_group_fn added. Both are transitional and will be removed
 *   once the blkg management code is fully moved into blk-cgroup.c.
 *
 * * blkio_alloc_group_fn() allocates policy-specific blkg which is usually a
 *   larger data structure with blkg as the first entry and initializes it.
 *   Note that initialization of blkg proper, including percpu stats, is
 *   responsibility of blk-cgroup proper.
 *
 *   Note that default config (weight, bps...) initialization is done from
 *   this method; otherwise, we end up violating locking order between blkcg
 *   and q locks via blkcg_get_CONF() functions.
 *
 * * blkio_link_group_fn() is called under queue_lock and responsible for
 *   linking the blkg to the queue. blkcg side is handled by blk-cgroup
 *   proper.
 *
 * * The common blkg creation function is named blkg_lookup_create() and
 *   blkiocg_lookup_group() is renamed to blkg_lookup() for consistency.
 *   Also, throtl / cfq related functions are similarly [re]named for
 *   consistency.
 *
 * This simplifies blkcg policy implementations and enables further cleanup.
 *
 * -v2: Vivek noticed that blkg_lookup_create() incorrectly tested
 *      blk_queue_dead() instead of blk_queue_bypass() leading a user of the
 *      function ending up creating a new blkg on bypassing queue. This is a
 *      bug introduced while relocating bypass patches before this one.
 *      Fixed.
 *
 * -v3: ERR_PTR patch folded into this one. @for_root added to
 *      blkg_lookup_create() to allow creating root group on a bypassed queue
 *      during elevator switch.
 *
 * Signed-off-by: Tejun Heo <tj@kernel.org>
 * Cc: Vivek Goyal <vgoyal@redhat.com>
 * Signed-off-by: Jens Axboe <axboe@kernel.dk>
 */
/*
 * Look up the blkg of the @blkcg - @q pair.
 * NOTE(review): locking requirements and the not-found return value are
 * not visible in this header -- confirm against the definition.
 */
extern struct blkio_group *blkg_lookup(struct blkio_cgroup *blkcg,
				       struct request_queue *q);
/*
 * History annotation (2012-03-06 01:15:06 +04:00):
 *
 * blkcg: factor out blkio_group creation
 *
 * Currently both blk-throttle and cfq-iosched implement their own
 * blkio_group creation code in throtl_get_tg() and cfq_get_cfqg(). This
 * patch factors out the common code into blkg_lookup_create(), which returns
 * ERR_PTR value so that transitional failures due to queue bypass can be
 * distinguished from other failures.
 *
 * * New blkio_policy_ops methods blkio_alloc_group_fn() and
 *   blkio_link_group_fn added. Both are transitional and will be removed
 *   once the blkg management code is fully moved into blk-cgroup.c.
 *
 * * blkio_alloc_group_fn() allocates policy-specific blkg which is usually a
 *   larger data structure with blkg as the first entry and initializes it.
 *   Note that initialization of blkg proper, including percpu stats, is
 *   responsibility of blk-cgroup proper.
 *
 *   Note that default config (weight, bps...) initialization is done from
 *   this method; otherwise, we end up violating locking order between blkcg
 *   and q locks via blkcg_get_CONF() functions.
 *
 * * blkio_link_group_fn() is called under queue_lock and responsible for
 *   linking the blkg to the queue. blkcg side is handled by blk-cgroup
 *   proper.
 *
 * * The common blkg creation function is named blkg_lookup_create() and
 *   blkiocg_lookup_group() is renamed to blkg_lookup() for consistency.
 *   Also, throtl / cfq related functions are similarly [re]named for
 *   consistency.
 *
 * This simplifies blkcg policy implementations and enables further cleanup.
 *
 * -v2: Vivek noticed that blkg_lookup_create() incorrectly tested
 *      blk_queue_dead() instead of blk_queue_bypass() leading a user of the
 *      function ending up creating a new blkg on bypassing queue. This is a
 *      bug introduced while relocating bypass patches before this one.
 *      Fixed.
 *
 * -v3: ERR_PTR patch folded into this one. @for_root added to
 *      blkg_lookup_create() to allow creating root group on a bypassed queue
 *      during elevator switch.
 *
 * Signed-off-by: Tejun Heo <tj@kernel.org>
 * Cc: Vivek Goyal <vgoyal@redhat.com>
 * Signed-off-by: Jens Axboe <axboe@kernel.dk>
 */
/*
 * blkg_lookup_create - common blkio_group lookup/creation entry point.
 * @blkcg:    block IO cgroup the group belongs to
 * @q:        request queue the group is associated with
 * @plid:     policy id (enum blkio_policy_id) the group is for
 * @for_root: per the changelog that introduced this function, allows
 *            creating the root group on a bypassed queue during an
 *            elevator switch
 *
 * Declaration only; the definition is not in this header.  Per the
 * introducing changelog, the function returns an ERR_PTR value on failure
 * so that transitional failures due to queue bypass can be distinguished
 * from other failures.
 * NOTE(review): locking requirements are not visible here - confirm
 * against the definition before calling.
 */
struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
struct request_queue *q,
enum blkio_policy_id plid,
bool for_root);
/*
 * blkiocg_update_timeslice_used - stats hook taking a group, a policy and
 * two time amounts.
 * @blkg:             group the update applies to
 * @pol:              policy on whose behalf the update is made
 * @time:             time amount (unsigned long)
 * @unaccounted_time: second time amount (unsigned long)
 *
 * Declaration only; the definition is not in this header.
 * NOTE(review): presumably @time is the timeslice consumed and
 * @unaccounted_time is time not charged to the cgroup; units are not
 * stated here - confirm against the definition.
 */
void blkiocg_update_timeslice_used(struct blkio_group *blkg,
struct blkio_policy_type *pol,
unsigned long time,
unsigned long unaccounted_time);
/*
 * blkiocg_update_dispatch_stats - stats hook taking a byte count and two
 * IO classification flags.
 * @blkg:      group the update applies to
 * @pol:       policy on whose behalf the update is made
 * @bytes:     byte count (uint64_t)
 * @direction: boolean IO classification flag
 * @sync:      boolean IO classification flag
 *
 * Declaration only; the definition is not in this header.
 * NOTE(review): presumably @direction selects read vs. write and @sync
 * selects synchronous vs. asynchronous IO; which value means which is not
 * visible here - confirm against the definition.
 */
void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
struct blkio_policy_type *pol,
uint64_t bytes, bool direction, bool sync);
2010-04-09 10:31:19 +04:00
/*
 * blkiocg_update_completion_stats - stats hook taking two timestamps and
 * two IO classification flags.
 * @blkg:          group the update applies to
 * @pol:           policy on whose behalf the update is made
 * @start_time:    first timestamp (uint64_t)
 * @io_start_time: second timestamp (uint64_t)
 * @direction:     boolean IO classification flag
 * @sync:          boolean IO classification flag
 *
 * Declaration only; the definition is not in this header.
 * NOTE(review): presumably feeds the service-time and wait-time stats
 * (both described as nanoseconds in enum stat_type), with @start_time
 * being when the request was queued and @io_start_time when it was
 * dispatched - confirm against the definition.
 */
void blkiocg_update_completion_stats(struct blkio_group *blkg,
struct blkio_policy_type *pol,
uint64_t start_time,
uint64_t io_start_time, bool direction,
bool sync);
/*
 * blkiocg_update_io_merged_stats - stats hook taking two IO
 * classification flags.
 * @blkg:      group the update applies to
 * @pol:       policy on whose behalf the update is made
 * @direction: boolean IO classification flag
 * @sync:      boolean IO classification flag
 *
 * Declaration only; the definition is not in this header.
 * NOTE(review): presumably bumps the "number of IOs merged" stat
 * (BLKIO_STAT_MERGED) - confirm against the definition.
 */
void blkiocg_update_io_merged_stats(struct blkio_group *blkg,
struct blkio_policy_type *pol,
bool direction, bool sync);
/*
 * blkiocg_update_io_add_stats - stats hook taking a second group and two
 * IO classification flags.
 * @blkg:      group the update applies to
 * @pol:       policy on whose behalf the update is made
 * @curr_blkg: a second group
 * @direction: boolean IO classification flag
 * @sync:      boolean IO classification flag
 *
 * Declaration only; the definition is not in this header.
 * NOTE(review): presumably accounts an IO being queued (the "number of
 * IOs queued up" stat, BLKIO_STAT_QUEUED), with @curr_blkg being the
 * group currently being served - confirm against the definition.
 */
void blkiocg_update_io_add_stats(struct blkio_group *blkg,
struct blkio_policy_type *pol,
struct blkio_group *curr_blkg, bool direction,
bool sync);
/*
 * blkiocg_update_io_remove_stats - stats hook taking two IO
 * classification flags.
 * @blkg:      group the update applies to
 * @pol:       policy on whose behalf the update is made
 * @direction: boolean IO classification flag
 * @sync:      boolean IO classification flag
 *
 * Declaration only; the definition is not in this header.
 * NOTE(review): presumably the counterpart of
 * blkiocg_update_io_add_stats(), accounting an IO leaving the queue -
 * confirm against the definition.
 */
void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
struct blkio_policy_type *pol,
bool direction, bool sync);
#else
/* Forward declaration: only pointers to struct cgroup are used below. */
struct cgroup;

/*
 * Stub from the #else branch of the enclosing preprocessor conditional
 * (presumably the build without block cgroup support - confirm against
 * the matching #if).  Ignores @cgroup and always yields NULL.
 */
static inline struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
{
	return NULL;
}
/*
 * Stub from the #else branch of the enclosing preprocessor conditional:
 * ignores @bio and always yields NULL.
 */
static inline struct blkio_cgroup *bio_blkio_cgroup(struct bio *bio)
{
	return NULL;
}
blkcg: factor out blkio_group creation Currently both blk-throttle and cfq-iosched implement their own blkio_group creation code in throtl_get_tg() and cfq_get_cfqg(). This patch factors out the common code into blkg_lookup_create(), which returns an ERR_PTR value so that transitional failures due to queue bypass can be distinguished from other failures. * New blkio_policy_ops methods blkio_alloc_group_fn() and blkio_link_group_fn() are added. Both are transitional and will be removed once the blkg management code is fully moved into blk-cgroup.c. * blkio_alloc_group_fn() allocates policy-specific blkg which is usually a larger data structure with blkg as the first entry and initializes it. Note that initialization of blkg proper, including percpu stats, is the responsibility of blk-cgroup proper. Note that default config (weight, bps...) initialization is done from this method; otherwise, we end up violating locking order between blkcg and q locks via blkcg_get_CONF() functions. * blkio_link_group_fn() is called under queue_lock and is responsible for linking the blkg to the queue. blkcg side is handled by blk-cgroup proper. * The common blkg creation function is named blkg_lookup_create() and blkiocg_lookup_group() is renamed to blkg_lookup() for consistency. Also, throtl / cfq related functions are similarly [re]named for consistency. This simplifies blkcg policy implementations and enables further cleanup. -v2: Vivek noticed that blkg_lookup_create() incorrectly tested blk_queue_dead() instead of blk_queue_bypass(), leading a user of the function to end up creating a new blkg on a bypassing queue. This is a bug introduced while relocating bypass patches before this one. Fixed. -v3: ERR_PTR patch folded into this one. @for_root added to blkg_lookup_create() to allow creating root group on a bypassed queue during elevator switch. Signed-off-by: Tejun Heo <tj@kernel.org> Cc: Vivek Goyal <vgoyal@redhat.com> Signed-off-by: Jens Axboe <axboe@kernel.dk>
2012-03-06 01:15:06 +04:00
/*
 * Stub from the #else branch of the enclosing preprocessor conditional:
 * ignores both arguments and always yields NULL, i.e. no group is found.
 *
 * NOTE(review): the second parameter is an untyped "void *key" here;
 * verify that this still matches the parameter list of the real
 * blkg_lookup() declaration used when the feature is enabled.
 */
static inline struct blkio_group *
blkg_lookup(struct blkio_cgroup *blkcg, void *key)
{
	return NULL;
}
/*
 * No-op stub from the #else branch of the enclosing preprocessor
 * conditional: accepts the same arguments as the real hook and does
 * nothing with them.
 */
static inline void
blkiocg_update_timeslice_used(struct blkio_group *blkg,
			      struct blkio_policy_type *pol,
			      unsigned long time,
			      unsigned long unaccounted_time)
{
}
2010-04-09 10:31:19 +04:00
/*
 * No-op stub from the #else branch of the enclosing preprocessor
 * conditional: accepts the same arguments as the real hook and does
 * nothing with them.
 */
static inline void
blkiocg_update_dispatch_stats(struct blkio_group *blkg,
			      struct blkio_policy_type *pol,
			      uint64_t bytes,
			      bool direction,
			      bool sync)
{
}
2010-04-09 10:31:19 +04:00
/*
 * No-op stub from the #else branch of the enclosing preprocessor
 * conditional: accepts the same arguments as the real hook and does
 * nothing with them.
 */
static inline void
blkiocg_update_completion_stats(struct blkio_group *blkg,
				struct blkio_policy_type *pol,
				uint64_t start_time,
				uint64_t io_start_time,
				bool direction,
				bool sync)
{
}
/*
 * No-op stub from the #else branch of the enclosing preprocessor
 * conditional: accepts the same arguments as the real hook and does
 * nothing with them.
 */
static inline void
blkiocg_update_io_merged_stats(struct blkio_group *blkg,
			       struct blkio_policy_type *pol,
			       bool direction,
			       bool sync)
{
}
/*
 * No-op stub from the #else branch of the enclosing preprocessor
 * conditional: accepts the same arguments as the real hook and does
 * nothing with them.
 */
static inline void
blkiocg_update_io_add_stats(struct blkio_group *blkg,
			    struct blkio_policy_type *pol,
			    struct blkio_group *curr_blkg,
			    bool direction,
			    bool sync)
{
}
/*
 * No-op stub from the #else branch of the enclosing preprocessor
 * conditional: accepts the same arguments as the real hook and does
 * nothing with them.
 */
static inline void
blkiocg_update_io_remove_stats(struct blkio_group *blkg,
			       struct blkio_policy_type *pol,
			       bool direction,
			       bool sync)
{
}
#endif
#endif /* _BLK_CGROUP_H */