Index: Linux-2.6.2/arch/alpha/Kconfig diff -u Linux-2.6.2/arch/alpha/Kconfig:1.1.1.1 Linux-2.6.2/arch/alpha/Kconfig:1.1.1.1.8.1 --- Linux-2.6.2/arch/alpha/Kconfig:1.1.1.1 Fri Feb 6 12:02:41 2004 +++ Linux-2.6.2/arch/alpha/Kconfig Fri Feb 6 18:42:21 2004 @@ -566,6 +566,8 @@ Take the default (1) unless you want more control or more info. +source "kernel/Kconfig" + source "drivers/pci/Kconfig" source "drivers/eisa/Kconfig" Index: Linux-2.6.2/arch/arm/Kconfig diff -u Linux-2.6.2/arch/arm/Kconfig:1.1.1.1 Linux-2.6.2/arch/arm/Kconfig:1.1.1.1.8.1 --- Linux-2.6.2/arch/arm/Kconfig:1.1.1.1 Fri Feb 6 12:02:41 2004 +++ Linux-2.6.2/arch/arm/Kconfig Fri Feb 6 18:42:21 2004 @@ -223,6 +223,7 @@ endmenu +source "kernel/Kconfig" menu "General setup" Index: Linux-2.6.2/arch/arm26/Kconfig diff -u Linux-2.6.2/arch/arm26/Kconfig:1.1.1.1 Linux-2.6.2/arch/arm26/Kconfig:1.1.1.1.8.1 --- Linux-2.6.2/arch/arm26/Kconfig:1.1.1.1 Fri Feb 6 12:02:39 2004 +++ Linux-2.6.2/arch/arm26/Kconfig Fri Feb 6 18:42:21 2004 @@ -85,6 +85,8 @@ machine with 4MB of memory. endmenu +source "kernel/Kconfig" + menu "General setup" # Compressed boot loader in ROM. Yes, we really want to ask about Index: Linux-2.6.2/arch/cris/Kconfig diff -u Linux-2.6.2/arch/cris/Kconfig:1.1.1.1 Linux-2.6.2/arch/cris/Kconfig:1.1.1.1.8.1 --- Linux-2.6.2/arch/cris/Kconfig:1.1.1.1 Fri Feb 6 12:02:39 2004 +++ Linux-2.6.2/arch/cris/Kconfig Fri Feb 6 18:42:21 2004 @@ -22,6 +22,7 @@ source "init/Kconfig" +source "kernel/Kconfig" menu "General setup" Index: Linux-2.6.2/arch/h8300/Kconfig diff -u Linux-2.6.2/arch/h8300/Kconfig:1.1.1.1 Linux-2.6.2/arch/h8300/Kconfig:1.1.1.1.8.1 --- Linux-2.6.2/arch/h8300/Kconfig:1.1.1.1 Fri Feb 6 12:02:42 2004 +++ Linux-2.6.2/arch/h8300/Kconfig Fri Feb 6 18:42:21 2004 @@ -179,6 +179,8 @@ BLKDEV start address. 
endmenu +source "kernel/Kconfig" + menu "Executable file formats" source "fs/Kconfig.binfmt" Index: Linux-2.6.2/arch/i386/Kconfig diff -u Linux-2.6.2/arch/i386/Kconfig:1.1.1.1 Linux-2.6.2/arch/i386/Kconfig:1.1.1.1.8.1 --- Linux-2.6.2/arch/i386/Kconfig:1.1.1.1 Fri Feb 6 12:02:36 2004 +++ Linux-2.6.2/arch/i386/Kconfig Fri Feb 6 18:42:21 2004 @@ -823,6 +823,7 @@ endmenu +source kernel/Kconfig menu "Power management options (ACPI, APM)" depends on !X86_VOYAGER Index: Linux-2.6.2/arch/ia64/Kconfig diff -u Linux-2.6.2/arch/ia64/Kconfig:1.1.1.1 Linux-2.6.2/arch/ia64/Kconfig:1.1.1.1.8.1 --- Linux-2.6.2/arch/ia64/Kconfig:1.1.1.1 Fri Feb 6 12:02:40 2004 +++ Linux-2.6.2/arch/ia64/Kconfig Fri Feb 6 18:42:21 2004 @@ -96,6 +96,8 @@ endchoice +source "kernel/Kconfig" + choice prompt "Kernel page size" default IA64_PAGE_SIZE_16KB Index: Linux-2.6.2/arch/m68k/Kconfig diff -u Linux-2.6.2/arch/m68k/Kconfig:1.1.1.1 Linux-2.6.2/arch/m68k/Kconfig:1.1.1.1.8.1 --- Linux-2.6.2/arch/m68k/Kconfig:1.1.1.1 Fri Feb 6 12:02:39 2004 +++ Linux-2.6.2/arch/m68k/Kconfig Fri Feb 6 18:42:21 2004 @@ -336,6 +336,7 @@ endmenu +source "kernel/Kconfig" menu "General setup" Index: Linux-2.6.2/arch/m68knommu/Kconfig diff -u Linux-2.6.2/arch/m68knommu/Kconfig:1.1.1.1 Linux-2.6.2/arch/m68knommu/Kconfig:1.1.1.1.8.1 --- Linux-2.6.2/arch/m68knommu/Kconfig:1.1.1.1 Fri Feb 6 12:02:36 2004 +++ Linux-2.6.2/arch/m68knommu/Kconfig Fri Feb 6 18:42:20 2004 @@ -91,6 +91,8 @@ endchoice +source "kernel/Kconfig" + config COLDFIRE bool depends on (M5206 || M5206e || M5249 || M5272 || M5282 || M5307 || M5407) Index: Linux-2.6.2/arch/mips/Kconfig diff -u Linux-2.6.2/arch/mips/Kconfig:1.1.1.1 Linux-2.6.2/arch/mips/Kconfig:1.1.1.1.8.1 --- Linux-2.6.2/arch/mips/Kconfig:1.1.1.1 Fri Feb 6 12:02:38 2004 +++ Linux-2.6.2/arch/mips/Kconfig Fri Feb 6 18:42:20 2004 @@ -1027,6 +1027,8 @@ endmenu +source "kernel/Kconfig" + menu "Bus options (PCI, PCMCIA, EISA, ISA, TC)" config PCI Index: Linux-2.6.2/arch/parisc/Kconfig diff -u 
Linux-2.6.2/arch/parisc/Kconfig:1.1.1.1 Linux-2.6.2/arch/parisc/Kconfig:1.1.1.1.8.1 --- Linux-2.6.2/arch/parisc/Kconfig:1.1.1.1 Fri Feb 6 12:02:42 2004 +++ Linux-2.6.2/arch/parisc/Kconfig Fri Feb 6 18:42:20 2004 @@ -160,6 +160,7 @@ endmenu +source "kernel/Kconfig" source "drivers/parisc/Kconfig" Index: Linux-2.6.2/arch/ppc/Kconfig diff -u Linux-2.6.2/arch/ppc/Kconfig:1.1.1.1 Linux-2.6.2/arch/ppc/Kconfig:1.1.1.1.8.1 --- Linux-2.6.2/arch/ppc/Kconfig:1.1.1.1 Fri Feb 6 12:02:37 2004 +++ Linux-2.6.2/arch/ppc/Kconfig Fri Feb 6 18:42:20 2004 @@ -911,6 +911,8 @@ endmenu +source "kernel/Kconfig" + menu "Bus options" config ISA Index: Linux-2.6.2/arch/ppc64/Kconfig diff -u Linux-2.6.2/arch/ppc64/Kconfig:1.1.1.1 Linux-2.6.2/arch/ppc64/Kconfig:1.1.1.1.8.1 --- Linux-2.6.2/arch/ppc64/Kconfig:1.1.1.1 Fri Feb 6 12:02:41 2004 +++ Linux-2.6.2/arch/ppc64/Kconfig Fri Feb 6 18:42:21 2004 @@ -167,6 +167,7 @@ endmenu +source "kernel/Kconfig" menu "General setup" Index: Linux-2.6.2/arch/s390/Kconfig diff -u Linux-2.6.2/arch/s390/Kconfig:1.1.1.1 Linux-2.6.2/arch/s390/Kconfig:1.1.1.1.8.1 --- Linux-2.6.2/arch/s390/Kconfig:1.1.1.1 Fri Feb 6 12:02:43 2004 +++ Linux-2.6.2/arch/s390/Kconfig Fri Feb 6 18:42:20 2004 @@ -252,6 +252,8 @@ endmenu +source "kernel/Kconfig" + config PCMCIA bool default n Index: Linux-2.6.2/arch/sh/Kconfig diff -u Linux-2.6.2/arch/sh/Kconfig:1.1.1.1 Linux-2.6.2/arch/sh/Kconfig:1.1.1.1.8.1 --- Linux-2.6.2/arch/sh/Kconfig:1.1.1.1 Fri Feb 6 12:02:39 2004 +++ Linux-2.6.2/arch/sh/Kconfig Fri Feb 6 18:42:21 2004 @@ -555,6 +555,7 @@ endmenu +source "kernel/Kconfig" menu "Bus options (PCI, PCMCIA, EISA, MCA, ISA)" Index: Linux-2.6.2/arch/sparc/Kconfig diff -u Linux-2.6.2/arch/sparc/Kconfig:1.1.1.1 Linux-2.6.2/arch/sparc/Kconfig:1.1.1.1.8.1 --- Linux-2.6.2/arch/sparc/Kconfig:1.1.1.1 Fri Feb 6 12:02:35 2004 +++ Linux-2.6.2/arch/sparc/Kconfig Fri Feb 6 18:42:20 2004 @@ -23,6 +23,7 @@ source "init/Kconfig" +source "kernel/Kconfig" menu "General setup" Index: 
Linux-2.6.2/arch/sparc64/Kconfig diff -u Linux-2.6.2/arch/sparc64/Kconfig:1.1.1.1 Linux-2.6.2/arch/sparc64/Kconfig:1.1.1.1.8.1 --- Linux-2.6.2/arch/sparc64/Kconfig:1.1.1.1 Fri Feb 6 12:02:36 2004 +++ Linux-2.6.2/arch/sparc64/Kconfig Fri Feb 6 18:42:21 2004 @@ -14,6 +14,7 @@ source "init/Kconfig" +source "kernel/Kconfig" menu "General setup" Index: Linux-2.6.2/arch/um/Kconfig diff -u Linux-2.6.2/arch/um/Kconfig:1.1.1.1 Linux-2.6.2/arch/um/Kconfig:1.1.1.1.8.1 --- Linux-2.6.2/arch/um/Kconfig:1.1.1.1 Fri Feb 6 12:02:37 2004 +++ Linux-2.6.2/arch/um/Kconfig Fri Feb 6 18:42:21 2004 @@ -172,6 +172,8 @@ source "init/Kconfig" +source "kernel/Kconfig" + source "drivers/base/Kconfig" source "arch/um/Kconfig_char" Index: Linux-2.6.2/arch/v850/Kconfig diff -u Linux-2.6.2/arch/v850/Kconfig:1.1.1.1 Linux-2.6.2/arch/v850/Kconfig:1.1.1.1.8.1 --- Linux-2.6.2/arch/v850/Kconfig:1.1.1.1 Fri Feb 6 12:02:42 2004 +++ Linux-2.6.2/arch/v850/Kconfig Fri Feb 6 18:42:20 2004 @@ -225,6 +225,8 @@ source init/Kconfig +source kernel/Kconfig + ############################################################################# menu "Bus options (PCI, PCMCIA, EISA, MCA, ISA)" Index: Linux-2.6.2/arch/x86_64/Kconfig diff -u Linux-2.6.2/arch/x86_64/Kconfig:1.1.1.1 Linux-2.6.2/arch/x86_64/Kconfig:1.1.1.1.8.1 --- Linux-2.6.2/arch/x86_64/Kconfig:1.1.1.1 Fri Feb 6 12:02:42 2004 +++ Linux-2.6.2/arch/x86_64/Kconfig Fri Feb 6 18:42:20 2004 @@ -77,6 +77,7 @@ source "init/Kconfig" +source "kernel/Kconfig" menu "Processor type and features" Index: Linux-2.6.2/fs/proc/array.c diff -u Linux-2.6.2/fs/proc/array.c:1.1.1.1 Linux-2.6.2/fs/proc/array.c:1.1.1.1.2.1 --- Linux-2.6.2/fs/proc/array.c:1.1.1.1 Fri Feb 6 12:02:35 2004 +++ Linux-2.6.2/fs/proc/array.c Fri Feb 6 16:31:04 2004 @@ -162,7 +162,7 @@ "Uid:\t%d\t%d\t%d\t%d\n" "Gid:\t%d\t%d\t%d\t%d\n", get_task_state(p), - (p->sleep_avg/1024)*100/(1000000000/1024), + (unsigned long)((EBS_ONE - p->cpu_rate_per_share * p->cpu_shares) / (EBS_ONE / 100)), p->tgid, p->pid, p->pid ? 
p->real_parent->pid : 0, p->pid && p->ptrace ? p->parent->pid : 0, Index: Linux-2.6.2/fs/proc/base.c diff -u Linux-2.6.2/fs/proc/base.c:1.1.1.1 Linux-2.6.2/fs/proc/base.c:1.1.1.1.2.1.4.1.6.2 --- Linux-2.6.2/fs/proc/base.c:1.1.1.1 Fri Feb 6 12:02:35 2004 +++ Linux-2.6.2/fs/proc/base.c Fri Feb 13 09:56:35 2004 @@ -67,6 +67,11 @@ PROC_TGID_ATTR_EXEC, PROC_TGID_ATTR_FSCREATE, #endif +#ifdef CONFIG_SCHED_STATS + PROC_TGID_CPU, +#endif + PROC_TGID_CPU_RATE_CAP, + PROC_TGID_CPU_SHARES, PROC_TGID_FD_DIR, PROC_TID_INO, PROC_TID_STATUS, @@ -90,6 +95,11 @@ PROC_TID_ATTR_EXEC, PROC_TID_ATTR_FSCREATE, #endif +#ifdef CONFIG_SCHED_STATS + PROC_TID_CPU, +#endif + PROC_TID_CPU_RATE_CAP, + PROC_TID_CPU_SHARES, PROC_TID_FD_DIR = 0x8000, /* 0x8000-0xffff */ }; @@ -123,6 +133,11 @@ #ifdef CONFIG_KALLSYMS E(PROC_TGID_WCHAN, "wchan", S_IFREG|S_IRUGO), #endif +#ifdef CONFIG_SCHED_STATS + E(PROC_TGID_CPU, "cpu", S_IFREG|S_IRUGO), +#endif + E(PROC_TGID_CPU_RATE_CAP,"cpu_rate_cap", S_IFREG|S_IRUGO|S_IWUSR), + E(PROC_TGID_CPU_SHARES, "cpu_shares", S_IFREG|S_IRUGO|S_IWUSR), {0,0,NULL,0} }; static struct pid_entry tid_base_stuff[] = { @@ -145,6 +160,11 @@ #ifdef CONFIG_KALLSYMS E(PROC_TID_WCHAN, "wchan", S_IFREG|S_IRUGO), #endif +#ifdef CONFIG_SCHED_STATS + E(PROC_TID_CPU, "cpu", S_IFREG|S_IRUGO), +#endif + E(PROC_TID_CPU_RATE_CAP,"cpu_rate_cap", S_IFREG|S_IRUGO|S_IWUSR), + E(PROC_TID_CPU_SHARES, "cpu_shares", S_IFREG|S_IRUGO|S_IWUSR), {0,0,NULL,0} }; @@ -180,7 +200,9 @@ int proc_pid_stat(struct task_struct*,char*); int proc_pid_status(struct task_struct*,char*); int proc_pid_statm(struct task_struct*,char*); +#ifdef CONFIG_SCHED_STATS int proc_pid_cpu(struct task_struct*,char*); +#endif static int proc_fd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt) { @@ -556,6 +578,125 @@ .read = proc_info_read, }; +/* + * Entitlement Based Scheduler (EBS) per task parameters + */ +static ssize_t cpu_rate_cap_read(struct file * file, char * buf, + size_t count, loff_t *ppos) +{ + 
struct task_struct *task = proc_task(file->f_dentry->d_inode); + char buffer[64]; + size_t len; + uint32_t enu = task->cpu_rate_cap; + uint32_t den = EBS_ONE; + char *qual = (EBS_CPU_RATE_CAP_IS_HARD & task->cpu_ebs_flags) ? " !" : ""; + int i; + + if (*ppos) + return 0; + for (i = 0; (i < EBS_OFFSET) && !(enu & 1); i++) { + enu >>= 1; + den >>= 1; + } + *ppos = len = sprintf(buffer, "%u / %u%s\n", enu, den, qual); + if (copy_to_user(buf, buffer, len)) + return -EFAULT; + + return len; +} + +static ssize_t cpu_rate_cap_write(struct file * file, const char * buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task = proc_task(file->f_dentry->d_inode); + char buffer[128] = ""; + char *endptr = NULL; + char *bptr = buffer; + unsigned long enu, den, hard = 0; + int res; + + if ((count > 127) || *ppos) + return -EFBIG; + if (copy_from_user(buffer, buf, count)) + return -EFAULT; + enu = simple_strtoul(buffer, &endptr, 0); + if ((endptr == buffer) || (enu == ULONG_MAX)) + return -EINVAL; + while (('\0' != *endptr) && ((' ' == *endptr) || ('\t' == *endptr))) + endptr++; + if ('/' != *endptr) + return -EINVAL; + endptr++; + bptr = endptr; + den = simple_strtoul(bptr, &endptr, 0); + if ((endptr == bptr) || (den == ULONG_MAX)) + return -EINVAL; + while (('\0' != *endptr) && ((' ' == *endptr) || ('\t' == *endptr) || ('\n' == *endptr))) + endptr++; + if ('!' 
== *endptr) { + hard = 1; + endptr++; + } + while (('\0' != *endptr) && ((' ' == *endptr) || ('\t' == *endptr) || ('\n' == *endptr))) + endptr++; + if ('\0' != *endptr) + return -EINVAL; + + if (0 != (res = set_cpu_rate_cap_fm_frac(task, enu, den, hard))) + return res; + + return count; +} + +static struct file_operations proc_cpu_rate_cap_operations = { + .read = cpu_rate_cap_read, + .write = cpu_rate_cap_write, +}; + +static ssize_t cpu_shares_read(struct file * file, char * buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task = proc_task(file->f_dentry->d_inode); + char buffer[64]; + size_t len; + + if (*ppos) + return 0; + *ppos = len = sprintf(buffer, "%u\n", task->cpu_shares); + if (copy_to_user(buf, buffer, len)) + return -EFAULT; + + return len; +} + +static ssize_t cpu_shares_write(struct file * file, const char * buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task = proc_task(file->f_dentry->d_inode); + char buffer[64] = ""; + char *endptr = NULL; + unsigned long shares; + int res; + + if ((count > 63) || *ppos) + return -EFBIG; + if (copy_from_user(buffer, buf, count)) + return -EFAULT; + shares = simple_strtoul(buffer, &endptr, 0); + if ((endptr == buffer) || (shares == ULONG_MAX)) + return -EINVAL; + + if (0 != (res = set_cpu_shares(task, shares))) + return res; + + return count; +} + +static struct file_operations proc_cpu_shares_operations = { + .read = cpu_shares_read, + .write = cpu_shares_write, +}; + static int mem_open(struct inode* inode, struct file* file) { file->private_data = (void*)((long)current->self_exec_id); @@ -1385,11 +1526,26 @@ inode->i_fop = &proc_pid_attr_operations; break; #endif + case PROC_TGID_CPU_RATE_CAP: + case PROC_TID_CPU_RATE_CAP: + inode->i_fop = &proc_cpu_rate_cap_operations; + break; + case PROC_TGID_CPU_SHARES: + case PROC_TID_CPU_SHARES: + inode->i_fop = &proc_cpu_shares_operations; + break; #ifdef CONFIG_KALLSYMS case PROC_TID_WCHAN: case PROC_TGID_WCHAN: inode->i_fop = &proc_info_file_operations; 
ei->op.proc_read = proc_pid_wchan; + break; +#endif +#ifdef CONFIG_SCHED_STATS + case PROC_TID_CPU: + case PROC_TGID_CPU: + inode->i_fop = &proc_info_file_operations; + ei->op.proc_read = proc_pid_cpu; break; #endif default: Index: Linux-2.6.2/fs/proc/proc_misc.c diff -u Linux-2.6.2/fs/proc/proc_misc.c:1.1.1.1 Linux-2.6.2/fs/proc/proc_misc.c:1.1.1.1.12.1 --- Linux-2.6.2/fs/proc/proc_misc.c:1.1.1.1 Fri Feb 6 12:02:35 2004 +++ Linux-2.6.2/fs/proc/proc_misc.c Sat Feb 7 23:06:40 2004 @@ -361,7 +361,8 @@ int i; extern unsigned long total_forks; u64 jif; - unsigned int sum = 0, user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0; + unsigned long long sum = 0, user = 0, nice = 0, system = 0, idle = 0, + iowait = 0, irq = 0, softirq = 0; struct timeval now; unsigned long seq; @@ -392,24 +393,24 @@ sum += kstat_cpu(i).irqs[j]; } - seq_printf(p, "cpu %u %u %u %u %u %u %u\n", - jiffies_to_clock_t(user), - jiffies_to_clock_t(nice), - jiffies_to_clock_t(system), - jiffies_to_clock_t(idle), - jiffies_to_clock_t(iowait), - jiffies_to_clock_t(irq), - jiffies_to_clock_t(softirq)); + seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu\n", + jiffies_64_to_clock_t(user), + jiffies_64_to_clock_t(nice), + jiffies_64_to_clock_t(system), + jiffies_64_to_clock_t(idle), + jiffies_64_to_clock_t(iowait), + jiffies_64_to_clock_t(irq), + jiffies_64_to_clock_t(softirq)); for_each_online_cpu(i) { - seq_printf(p, "cpu%d %u %u %u %u %u %u %u\n", + seq_printf(p, "cpu%d %llu %llu %llu %llu %llu %llu %llu\n", i, - jiffies_to_clock_t(kstat_cpu(i).cpustat.user), - jiffies_to_clock_t(kstat_cpu(i).cpustat.nice), - jiffies_to_clock_t(kstat_cpu(i).cpustat.system), - jiffies_to_clock_t(kstat_cpu(i).cpustat.idle), - jiffies_to_clock_t(kstat_cpu(i).cpustat.iowait), - jiffies_to_clock_t(kstat_cpu(i).cpustat.irq), - jiffies_to_clock_t(kstat_cpu(i).cpustat.softirq)); + jiffies_64_to_clock_t(kstat_cpu(i).cpustat.user), + jiffies_64_to_clock_t(kstat_cpu(i).cpustat.nice), + 
jiffies_64_to_clock_t(kstat_cpu(i).cpustat.system), + jiffies_64_to_clock_t(kstat_cpu(i).cpustat.idle), + jiffies_64_to_clock_t(kstat_cpu(i).cpustat.iowait), + jiffies_64_to_clock_t(kstat_cpu(i).cpustat.irq), + jiffies_64_to_clock_t(kstat_cpu(i).cpustat.softirq)); } - seq_printf(p, "intr %u", sum); + seq_printf(p, "intr %llu", sum); @@ -652,6 +653,11 @@ entry->proc_fops = f; } +#ifdef CONFIG_SCHED_STATS +extern int cpustats_read_proc(char *page, char **start, off_t off, int count, + int *eof, void *data); +#endif + void __init proc_misc_init(void) { struct proc_dir_entry *entry; @@ -677,6 +683,9 @@ #endif {"locks", locks_read_proc}, {"execdomains", execdomains_read_proc}, +#ifdef CONFIG_SCHED_STATS + {"cpustats", cpustats_read_proc}, +#endif {NULL,} }; for (p = simple_ones; p->name; p++) Index: Linux-2.6.2/fs/proc/root.c diff -u Linux-2.6.2/fs/proc/root.c:1.1.1.1 Linux-2.6.2/fs/proc/root.c:1.1.1.1.8.1.2.1 --- Linux-2.6.2/fs/proc/root.c:1.1.1.1 Fri Feb 6 12:02:35 2004 +++ Linux-2.6.2/fs/proc/root.c Sat Feb 7 21:06:38 2004 @@ -78,6 +78,12 @@ proc_rtas_init(); #endif proc_bus = proc_mkdir("bus", 0); +#ifdef CONFIG_SCHED_DYNAMIC_TIME_SLICE + init_cpu_time_slice_file(); +#endif +#ifdef CONFIG_SCHED_DYNAMIC_HALF_LIFE + init_cpu_half_life_file(); +#endif } static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, struct nameidata *nd) Index: Linux-2.6.2/include/linux/init_task.h diff -u Linux-2.6.2/include/linux/init_task.h:1.1.1.1 Linux-2.6.2/include/linux/init_task.h:1.1.1.1.2.1.4.1.6.1 --- Linux-2.6.2/include/linux/init_task.h:1.1.1.1 Fri Feb 6 12:02:59 2004 +++ Linux-2.6.2/include/linux/init_task.h Sat Feb 7 23:06:40 2004 @@ -59,7 +59,21 @@ /* * INIT_TASK is used to set up the first task table, touch at * your own risk!. 
Base=0, limit=0x1fffff (=2MB) + * Offset initial values for jiffies means that we can't rely on the default + * zero setting for all time stamps */ +#ifdef CONFIG_SMP +#define INIT_TASK_MEM_CACHE_TIMESTAMP() .mem_cache_timestamp = INITIAL_JIFFIES, +#else +#define INIT_TASK_MEM_CACHE_TIMESTAMP() /* do nothing */ +#endif + +#ifdef CONFIG_SCHED_STATS +#define INIT_TASK_RUNNABLE_TIMESTAMP() .runnable_timestamp = INITIAL_JIFFIES, +#else +#define INIT_TASK_RUNNABLE_TIMESTAMP() /* do nothing */ +#endif + #define INIT_TASK(tsk) \ { \ .state = 0, \ @@ -67,13 +81,20 @@ .usage = ATOMIC_INIT(2), \ .flags = 0, \ .lock_depth = -1, \ - .prio = MAX_PRIO-20, \ - .static_prio = MAX_PRIO-20, \ + .nice = 0, \ .policy = SCHED_NORMAL, \ + .cpu_shares = EBS_DEFAULT_SHARES, \ + .cpu_rate_cap = EBS_ONE, \ + .cpu_ebs_flags = EBS_FLAGS_DEFAULT, \ + .cpu_incr_per_tick = EBS_INIT_INCR_PER_SHARE(EBS_DEFAULT_SHARES), \ + .cpu_rate_cap_per_share = (EBS_ONE / EBS_DEFAULT_SHARES), \ + .cpu_rate_timestamp = INITIAL_JIFFIES, \ + .start_time = INITIAL_JIFFIES, \ .cpus_allowed = CPU_MASK_ALL, \ .mm = NULL, \ .active_mm = &init_mm, \ .run_list = LIST_HEAD_INIT(tsk.run_list), \ + .prom_list = LIST_HEAD_INIT(tsk.prom_list), \ .time_slice = HZ, \ .tasks = LIST_HEAD_INIT(tsk.tasks), \ .ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children), \ @@ -84,6 +105,9 @@ .sibling = LIST_HEAD_INIT(tsk.sibling), \ .group_leader = &tsk, \ .wait_chldexit = __WAIT_QUEUE_HEAD_INITIALIZER(tsk.wait_chldexit),\ + .sinbin_timer = { \ + .function = ebs_sinbin_fn \ + }, \ .real_timer = { \ .function = it_real_fn \ }, \ @@ -108,6 +132,8 @@ .proc_lock = SPIN_LOCK_UNLOCKED, \ .switch_lock = SPIN_LOCK_UNLOCKED, \ .journal_info = NULL, \ + INIT_TASK_MEM_CACHE_TIMESTAMP() \ + INIT_TASK_RUNNABLE_TIMESTAMP() \ } Index: Linux-2.6.2/include/linux/kernel_stat.h diff -u Linux-2.6.2/include/linux/kernel_stat.h:1.1.1.1 Linux-2.6.2/include/linux/kernel_stat.h:1.1.1.1.12.1 --- Linux-2.6.2/include/linux/kernel_stat.h:1.1.1.1 Fri Feb 6 12:03:00 
2004 +++ Linux-2.6.2/include/linux/kernel_stat.h Sat Feb 7 23:06:40 2004 @@ -14,13 +14,16 @@ */ struct cpu_usage_stat { - unsigned int user; - unsigned int nice; - unsigned int system; - unsigned int softirq; - unsigned int irq; - unsigned int idle; - unsigned int iowait; + unsigned long long user; + unsigned long long nice; + unsigned long long system; +#ifdef CONFIG_SCHED_STATS + unsigned long long runnable; +#endif + unsigned long long softirq; + unsigned long long irq; + unsigned long long idle; + unsigned long long iowait; }; struct kernel_stat { Index: Linux-2.6.2/include/linux/list.h diff -u Linux-2.6.2/include/linux/list.h:1.1.1.1 Linux-2.6.2/include/linux/list.h:1.1.1.1.2.1 --- Linux-2.6.2/include/linux/list.h:1.1.1.1 Fri Feb 6 12:02:59 2004 +++ Linux-2.6.2/include/linux/list.h Fri Feb 6 16:31:04 2004 @@ -252,6 +252,50 @@ } /** + * __list_extract_slice - extracts a slice from a given list + * @slice: the new sub-list to return + * @head: head of the slice to extract + * @tail: tail of the slice to extract + * + * Assumes head and tail are not empty and are in the same list. + */ +static inline void __list_extract_slice(struct list_head *slice, + struct list_head *head, + struct list_head *tail) +{ + struct list_head *before = head->prev; + struct list_head *after = tail->next; + + slice->next = head; + head->prev = slice; + slice->prev = tail; + tail->next = slice; + + before->next = after; + after->prev = before; +} + +/** + * list_extract_slice - extracts a slice from a given list + * @slice: the new sub-list to return + * @head: head of the slice to extract + * @tail: tail of the slice to extract + * + * Assumes head and tail are in the same list. 
+ */ +static inline void list_extract_slice(struct list_head *slice, + struct list_head *head, + struct list_head *tail) +{ + if (list_empty(head)) { + INIT_LIST_HEAD(slice); + return; + } + + __list_extract_slice(slice, head, tail); +} + +/** * list_splice_init - join two lists and reinitialise the emptied list. * @list: the new list to add. * @head: the place to add it in the first list. Index: Linux-2.6.2/include/linux/sched.h diff -u Linux-2.6.2/include/linux/sched.h:1.1.1.1 Linux-2.6.2/include/linux/sched.h:1.1.1.1.2.1.2.1.2.1.2.1.2.1.2.1 --- Linux-2.6.2/include/linux/sched.h:1.1.1.1 Fri Feb 6 12:02:59 2004 +++ Linux-2.6.2/include/linux/sched.h Sat Feb 7 23:06:40 2004 @@ -54,12 +54,13 @@ #define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */ #define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */ #define CLONE_STOPPED 0x02000000 /* Start in stopped state */ +#define CLONE_RESET_CPU_USAGE 0x04000000 /* Don't inherit parent's CPU usage rate */ /* * List of flags we want to share for kernel threads, * if only because they are not used by them anyway. */ -#define CLONE_KERNEL (CLONE_FS | CLONE_FILES | CLONE_SIGHAND) +#define CLONE_KERNEL (CLONE_FS | CLONE_FILES | CLONE_SIGHAND | CLONE_RESET_CPU_USAGE) /* * These are the constant used to fake the fixed-point load-average @@ -283,7 +284,7 @@ #define MAX_PRIO (MAX_RT_PRIO + 40) -#define rt_task(p) ((p)->prio < MAX_RT_PRIO) +#define rt_task(p) ((p)->policy != SCHED_NORMAL) /* * Some day this will be a full-fledged user tracking system.. 
@@ -338,18 +339,36 @@ int lock_depth; /* Lock depth */ - int prio, static_prio; struct list_head run_list; - prio_array_t *array; + struct list_head prom_list; - unsigned long sleep_avg; - long interactive_credit; - unsigned long long timestamp; - int activated; + unsigned int cpu_ebs_flags; + uint32_t cpu_rate_per_share; + /* + * These next two fields hold (relatively) constant values that depend + * on the number of shares the task has and its CPU cap + */ + uint32_t cpu_incr_per_tick; + uint32_t cpu_rate_cap_per_share; + unsigned int time_slice; + unsigned long cpu_rate_timestamp; + +#ifdef CONFIG_SMP + unsigned long mem_cache_timestamp; +#endif + struct timer_list sinbin_timer; + unsigned long sinbin_timestamp; +#ifdef CONFIG_SMP + unsigned long per_cpu_sinbin_ticks[NR_CPUS]; +#else + unsigned long sinbin_ticks; +#endif + unsigned int cpu_shares; + uint32_t cpu_rate_cap; + int nice; unsigned long policy; cpumask_t cpus_allowed; - unsigned int time_slice, first_time_slice; struct list_head tasks; struct list_head ptrace_children; @@ -397,6 +416,17 @@ struct list_head posix_timers; /* POSIX.1b Interval Timers */ unsigned long utime, stime, cutime, cstime; unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; /* context switch counts */ +#ifdef CONFIG_SCHED_STATS + unsigned long runnable_timestamp; +#ifdef CONFIG_SMP + unsigned long long per_cpu_utime[NR_CPUS]; + unsigned long long per_cpu_stime[NR_CPUS]; + unsigned long long per_cpu_slices[NR_CPUS]; + unsigned long long per_cpu_runnable[NR_CPUS]; +#else + unsigned long long slices, runnable; +#endif +#endif u64 start_time; /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */ unsigned long min_flt, maj_flt, nswap, cmin_flt, cmaj_flt, cnswap; @@ -500,6 +530,77 @@ #define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */ #define PF_SYNCWRITE 0x00200000 /* I am doing a sync write */ +/* + * Entitlement based scheduling (EBS) only applies to SCHED_NORMAL tasks and + 
* therefore only uses the SCHED_NORMAL priority slots + */ +#define EBS_MIN_PRI MAX_RT_PRIO +#define EBS_MAX_PRI (MAX_PRIO - 2) +#define EBS_BGND_PRI (MAX_PRIO - 1) +#define EBS_SLOPE ((EBS_MAX_PRI - EBS_MIN_PRI) / 2) +#define EBS_MEAN_PRI ((EBS_MAX_PRI + EBS_MIN_PRI) / 2) +/* + * Share range is chosen to map cleanly onto nice values and provide roughly + * the same power (20 times) above and below the default. Going much beyond + * this would require FULL 64 bit arithmetic which is not available on all + * systems + */ +#define EBS_MIN_SHARES 1 +#define EBS_MAX_SHARES 420 +#define EBS_DEFAULT_SHARES 20 +#define EBS_NICE_TO_SHARES(n) \ + (((n) >= 0) ? (EBS_DEFAULT_SHARES - (n)) : (EBS_DEFAULT_SHARES + ((n) * (n)))) +/* + * Denominator for rational numbers is chosen so as to give the maximum + * resolution while at the same time covering the required range of values + */ +#define EBS_OFFSET 27 +#define EBS_ONE ((uint32_t)1 << EBS_OFFSET) +/* + * The half life is a key control value for the EBS scheduler and is set at + * compile time (may become runtime settable at some future date) + */ +#define EBS_INIT_HALF_LIFE_MSECS ((uint32_t)CONFIG_SCHED_HALF_LIFE) +#define EBS_INIT_HALF_LIFE_TICKS ((EBS_INIT_HALF_LIFE_MSECS * HZ) / 1000) +/* + * decay per tick is exp(ln(0.5) / "half life in ticks") + * Evaluated using polynomial approximation with coefficients from + * "Handbook of Mathematical Functions" by Abramowitz and Stegun (1972) + * Equation 4.2.45, page 71 + * For maximum accuracy do this with 2**32 as denominator and then convert the + * final result to a value consistent with 2 ** EBS_OFFSET as denominator + */ +#define EBS_LOG_2 ((uint32_t)0xb17217f7) +#define EBS_A_1 ((uint64_t)0xfffffffd) +#define EBS_A_2 ((uint64_t)0x7ffffeaa) +#define EBS_A_3 ((uint64_t)0x2aa993c5) +#define EBS_A_4 ((uint64_t)0x0aaa0e51) +#define EBS_A_5 ((uint64_t)0x022009b4) +#define EBS_A_6 ((uint64_t)0x005727b7) +#define EBS_A_7 ((uint64_t)0x000942e4) +#define EBS_MUL_32(a, b) (((uint64_t)(a) * 
(uint64_t)(b)) >> 32) +#define EBS_POLY(x) \ +(uint32_t)((EBS_MUL_32(x, \ + EBS_A_1 - EBS_MUL_32(x, \ + EBS_A_2 - EBS_MUL_32(x, \ + EBS_A_3 - EBS_MUL_32(x, \ + EBS_A_4 - EBS_MUL_32(x, \ + EBS_A_5 - EBS_MUL_32(x, \ + EBS_A_6 - EBS_MUL_32(x, \ + EBS_A_7)))))))) \ + >> (32 - EBS_OFFSET)) +#define EBS_INIT_INCR_PER_TICK (EBS_POLY(EBS_LOG_2 / EBS_INIT_HALF_LIFE_TICKS)) +#define EBS_INIT_DECAY_PER_TICK (EBS_ONE - EBS_INIT_INCR_PER_TICK) +#define EBS_INIT_INCR_PER_SHARE(s) (EBS_INIT_INCR_PER_TICK / (s)) + +/* + * Flags for cpu rate capping + */ +#define EBS_CPU_RATE_CAP_IS_HARD (1<<0) +#define EBS_CPU_RATE_CAP_SINBIN (1<<1) +#define EBS_NEEDS_PRIO_RECALC (1<<2) +#define EBS_FLAGS_DEFAULT 0 + #ifdef CONFIG_SMP extern int set_cpus_allowed(task_t *p, cpumask_t new_mask); #else @@ -579,8 +680,7 @@ static inline void kick_process(struct task_struct *tsk) { } #endif extern void FASTCALL(wake_up_forked_process(struct task_struct * tsk)); -extern void FASTCALL(sched_fork(task_t * p)); -extern void FASTCALL(sched_exit(task_t * p)); +extern void FASTCALL(sched_fork(task_t * p, unsigned long clone_flags)); asmlinkage long sys_wait4(pid_t pid,unsigned int * stat_addr, int options, struct rusage * ru); @@ -905,6 +1005,58 @@ } #endif /* CONFIG_SMP */ + +/* + * Function for sinbin timer -- releases a task from the sinbin + */ +void ebs_sinbin_fn(unsigned long arg); + +/* + * Require: (0x7fffffff >= den > 0) and (enu <= den) and ((enu != 0) or (not hard)) + */ +int set_cpu_rate_cap_fm_frac(struct task_struct *p, uint32_t enu, uint32_t den, int hard); +/* + * Require: 0 <= cap <= EBS_ONE and ((cap != 0) or (not hard)) + */ +int set_cpu_rate_cap(struct task_struct *p, uint32_t new_cap, int hard); +/* + * Require: 1 <= shares <= EBS_MAX_SHARES + */ +int set_cpu_shares(struct task_struct *p, unsigned int shares); + +#ifdef CONFIG_SCHED_DYNAMIC_TIME_SLICE +/* + * Require: 1 <= time_slice_msecs <= EBS_MAX_TIME_SLICE + */ +int set_cpu_time_slice_msecs(unsigned int time_slice_msecs); + +/* + * 
Return the value of CPU time slice in msecs + */ +unsigned int get_cpu_time_slice_msecs(void); + +#ifdef CONFIG_PROC_FS +void init_cpu_time_slice_file(void); +#endif +#endif + +#ifdef CONFIG_SCHED_DYNAMIC_HALF_LIFE +#define EBS_MIN_HALF_LIFE_MSECS 1000 +#define EBS_MAX_HALF_LIFE_MSECS 100000 +/* + * Require: EBS_MIN_HALF_LIFE_MSECS <= half_life_msecs <= EBS_MAX_HALF_LIFE_MSECS + */ +int set_cpu_half_life_msecs(unsigned int half_life_msecs); + +/* + * Return the value of CPU half life in msecs + */ +unsigned int get_cpu_half_life_msecs(void); + +#ifdef CONFIG_PROC_FS +void init_cpu_half_life_file(void); +#endif +#endif #endif /* __KERNEL__ */ Index: Linux-2.6.2/kernel/Kconfig diff -u /dev/null Linux-2.6.2/kernel/Kconfig:1.1.2.1.2.1.2.2 --- /dev/null Mon Feb 23 13:32:49 2004 +++ Linux-2.6.2/kernel/Kconfig Wed Feb 18 18:13:27 2004 @@ -0,0 +1,70 @@ +# +# CPU scheduling options +# +menu "CPU Scheduling" + +config SCHED_HALF_LIFE + int "CPU scheduler response half life (msecs)" + default "5000" + range 1000 100000 + ---help--- + CPU Scheduler Response Half Life + Determines how quickly the CPU scheduler responds to changes in CPU demand. + Can be used to tune system performance - Range 1000 to 100000 + +config SCHED_DYNAMIC_HALF_LIFE + bool "Dynamic CPU scheduler response half life" + default y + ---help--- + Dynamic CPU Scheduler Response Half Life + Saying yes here allows the CPU scheduler half life to be set dynamically on a + running system. + +config SCHED_DEFAULT_TIME_SLICE + int "CPU scheduler default time slice (msecs)" + default "100" + range 10 500 + ---help--- + CPU Scheduler Default Time Slice + Determines how long the CPU scheduler leaves a task on the CPU before switching + to the next runnable task. 
+ Can be used to tune system performance - Range 10 to 500 + +config SCHED_DYNAMIC_TIME_SLICE + bool "CPU scheduler dynamic time slice setting" + default y + ---help--- + CPU Scheduler Dynamic Time Slice Setting + Saying yes here will allow the size of the time slice received by tasks to be altered + dynamically on a running system + +config SCHED_STATS + bool "CPU scheduler statistics" + default y + ---help--- + CPU Scheduler Statistics + If you say yes here you get per CPU and per task scheduler + statistics (including time spent on run queues). + + Per CPU statistics are displayed in the file /proc/cpustats. The + first line contains the total for all CPUs on the system in the + following format: + + cpu + @ + + and subsequent lines hold the same statistics for each individual + CPU minus the timestamp which is valid for the entire file. + + Per task statistics are displayed in the file /proc/<pid>/cpu. The + first line contains the totals (for all CPUs) for this task in the + following format: + + cpu + @ + + and subsequent lines hold the same statistics for each individual + CPU minus sleep-ticks (which can't sensibly be attributed to any + CPU) and the timestamp which is valid for the whole file. 
+ +endmenu Index: Linux-2.6.2/kernel/exit.c diff -u Linux-2.6.2/kernel/exit.c:1.1.1.1 Linux-2.6.2/kernel/exit.c:1.1.1.1.2.1 --- Linux-2.6.2/kernel/exit.c:1.1.1.1 Fri Feb 6 12:03:09 2004 +++ Linux-2.6.2/kernel/exit.c Fri Feb 6 16:31:03 2004 @@ -95,7 +95,6 @@ p->parent->cnswap += p->nswap + p->cnswap; p->parent->cnvcsw += p->nvcsw + p->cnvcsw; p->parent->cnivcsw += p->nivcsw + p->cnivcsw; - sched_exit(p); write_unlock_irq(&tasklist_lock); spin_unlock(&p->proc_lock); proc_pid_flush(proc_dentry); Index: Linux-2.6.2/kernel/fork.c diff -u Linux-2.6.2/kernel/fork.c:1.1.1.1 Linux-2.6.2/kernel/fork.c:1.1.1.1.2.1 --- Linux-2.6.2/kernel/fork.c:1.1.1.1 Fri Feb 6 12:03:09 2004 +++ Linux-2.6.2/kernel/fork.c Fri Feb 6 16:31:03 2004 @@ -978,7 +978,7 @@ p->pdeath_signal = 0; /* Perform scheduler related setup */ - sched_fork(p); + sched_fork(p, clone_flags); /* * Ok, make it visible to the rest of the system. Index: Linux-2.6.2/kernel/sched.c diff -u Linux-2.6.2/kernel/sched.c:1.1.1.1 Linux-2.6.2/kernel/sched.c:1.1.1.1.2.1.2.1.2.1.2.1.2.1.2.1 --- Linux-2.6.2/kernel/sched.c:1.1.1.1 Fri Feb 6 12:03:09 2004 +++ Linux-2.6.2/kernel/sched.c Sat Feb 7 23:06:40 2004 @@ -15,6 +15,8 @@ * and per-CPU runqueues. Cleanups and useful suggestions * by Davide Libenzi, preemptible kernel bits by Robert Love. * 2003-09-03 Interactivity tuning by Con Kolivas. + * 2003-12-13 Entitlement based scheduling by Peter Williams, John Lee, + * and Kingsley Cheung. */ #include @@ -37,6 +39,8 @@ #include #include #include +#include +#include #ifdef CONFIG_NUMA #define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu)) @@ -44,137 +48,342 @@ #define cpu_to_node_mask(cpu) (cpu_online_map) #endif +#define TASK_NICE(p) (p)->nice + +#define TASK_PREEMPTS_CURR(p, rq) \ + ((p) < (rq)->current_prio_slot->prio) + +static unsigned int time_slice_ticks = \ + ((CONFIG_SCHED_DEFAULT_TIME_SLICE * HZ) / 1000) ? 
\ + ((CONFIG_SCHED_DEFAULT_TIME_SLICE * HZ) / 1000) : 1; + +#define EBS_MAX_TIME_SLICE 500 /* - * Convert user-nice values [ -20 ... 0 ... 19 ] - * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], - * and back. + * task_timeslice() is the interface that is used by the scheduler. + * TODO: modify time slice when system gets busy */ -#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) -#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) -#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) +static inline unsigned int task_timeslice(task_t *p) +{ + return time_slice_ticks; +} +#ifdef CONFIG_SCHED_DYNAMIC_TIME_SLICE /* - * 'User priority' is the nice value converted to something we - * can work with better when scaling various scheduler parameters, - * it's a [ 0 ... 39 ] range. + * Require: 1 <= time_slice_msecs <= EBS_MAX_TIME_SLICE */ -#define USER_PRIO(p) ((p)-MAX_RT_PRIO) -#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) -#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) -#define AVG_TIMESLICE (MIN_TIMESLICE + ((MAX_TIMESLICE - MIN_TIMESLICE) *\ - (MAX_PRIO-1-NICE_TO_PRIO(0))/(MAX_USER_PRIO - 1))) +int set_cpu_time_slice_msecs(unsigned int time_slice_msecs) +{ + unsigned int new_time_slice; + + if ((EBS_MAX_TIME_SLICE < time_slice_msecs) || (1 > time_slice_msecs)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + new_time_slice = ((time_slice_msecs * HZ) / 1000); + /* + * Assignment should be atomic so there's no need for locking + */ + time_slice_ticks = new_time_slice ? 
new_time_slice : 1; + + return 0; +} /* - * Some helpers for converting nanosecond timing to jiffy resolution + * Return the value of CPU time slice in msecs */ -#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) -#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) +unsigned int get_cpu_time_slice_msecs(void) +{ + return (time_slice_ticks * 1000) / HZ; +} + +EXPORT_SYMBOL(set_cpu_time_slice_msecs); +EXPORT_SYMBOL(get_cpu_time_slice_msecs); + +#ifdef CONFIG_PROC_FS +static int +read_cpu_time_slice_fn(char *page, char **start, off_t off, int count, int *eof, void *data) +{ + int len; + + len = sprintf(page, "%u\n", get_cpu_time_slice_msecs()); + + return len; +} + +static int +write_cpu_time_slice_fn(struct file *file, const char __user *buffer, unsigned long count, void *data) +{ + char kbuf[32] = ""; /* zero-filled so that a shorter write stays NUL terminated */ + char *endptr = NULL; + unsigned long time_slice_msecs; + int res; + + if (count >= sizeof(kbuf)) /* a completely full buffer would leave no NUL for simple_strtoul() */ + return -EFBIG; + if (copy_from_user(kbuf, buffer, count)) + return -EFAULT; + time_slice_msecs = simple_strtoul(kbuf, &endptr, 0); + if ((endptr == kbuf) || (time_slice_msecs == ULONG_MAX)) + return -EINVAL; + + res = set_cpu_time_slice_msecs(time_slice_msecs); + + return res ? res : count; +} + +static struct proc_dir_entry *cpu_time_slice_file = NULL; + +void __init init_cpu_time_slice_file(void) +{ + if (!(cpu_time_slice_file = create_proc_entry("cpu_time_slice", 0644, NULL))) + return; + cpu_time_slice_file->read_proc = read_cpu_time_slice_fn; + cpu_time_slice_file->write_proc = write_cpu_time_slice_fn; +} +#endif +#endif /* - * These are the 'tuning knobs' of the scheduler: - * - * Minimum timeslice is 10 msecs, default timeslice is 100 msecs, - * maximum timeslice is 200 msecs. Timeslices get refilled after - * they expire. + * Get time elapsed allowing for jiffies wrap + * 1. the true type for ts should be an unsigned long or this may break + * 2. 
should only be used where interval is expected to be <= ULONG_MAX */ -#define MIN_TIMESLICE ( 10 * HZ / 1000) -#define MAX_TIMESLICE (200 * HZ / 1000) -#define ON_RUNQUEUE_WEIGHT 30 -#define CHILD_PENALTY 95 -#define PARENT_PENALTY 100 -#define EXIT_WEIGHT 3 -#define PRIO_BONUS_RATIO 25 -#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100) -#define INTERACTIVE_DELTA 2 -#define MAX_SLEEP_AVG (AVG_TIMESLICE * MAX_BONUS) -#define STARVATION_LIMIT (MAX_SLEEP_AVG) -#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) -#define NODE_THRESHOLD 125 -#define CREDIT_LIMIT 100 +#define jiffies_since(ts) ((unsigned long)((long)jiffies - (long)(ts))) /* - * If a task is 'interactive' then we reinsert it in the active - * array after it has expired its current timeslice. (it will not - * continue to run immediately, it will still roundrobin with - * other interactive tasks.) - * - * This part scales the interactivity limit depending on niceness. - * - * We scale it linearly, offset by the INTERACTIVE_DELTA delta. - * Here are a few examples of different nice levels: - * - * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0] - * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0] - * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0] - * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0] - * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0] - * - * (the X axis represents the possible -5 ... 0 ... +5 dynamic - * priority range a task can explore, a value of '1' means the - * task is rated interactive.) - * - * Ie. nice +19 tasks can never get 'interactive' enough to be - * reinserted into the active array. And only heavily CPU-hog nice -20 - * tasks will be expired. Default nice 0 tasks are somewhere between, - * it takes some effort for them to get interactive, but it's not - * too hard. 
+ * O(1) Entitlement Based Scheduler (EBS) Utilities + */ +/* + * We need 64 bit intermediate values to prevent overflow during multiply + */ +#define EBS_MUL(a, b) \ + ((uint32_t)(((uint64_t)(a) * (uint64_t)(b)) >> EBS_OFFSET)) +#define EBS_DECAYED_FOR_TICK(v) EBS_MUL(v, ebs_decay_per_tick) +/* + * For this abbreviated fixed denominator rational number operation to be valid + * "a" must be less than or equal to EBS_ONE, "b" must be greater than or equal + * to "a" and greater than zero */ +#define EBS_MAP_SCALE(a, b) (((a) * EBS_SLOPE) / (b)) -#define CURRENT_BONUS(p) \ - (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \ - MAX_SLEEP_AVG) +static unsigned long ebs_half_life_ticks = EBS_INIT_HALF_LIFE_TICKS; +static uint32_t ebs_decay_per_tick = EBS_INIT_DECAY_PER_TICK; +static uint32_t ebs_incr_per_tick = EBS_INIT_INCR_PER_TICK; +#define EBS_INCR_PER_SHARE(s) (ebs_incr_per_tick / s) -#ifdef CONFIG_SMP -#define TIMESLICE_GRANULARITY(p) (MIN_TIMESLICE * \ - (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \ - num_online_cpus()) -#else -#define TIMESLICE_GRANULARITY(p) (MIN_TIMESLICE * \ - (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? 
: 1) - 1))) -#endif +/* + * This is a cached array of decay values (trading memory space for speed) + * when i > 0: ebs_decay_cache[i] == ((ebs_decay_cache[i - 1] * EBS_DECAY_PER_TICK) >> EBS_OFFSET) + * when i == 0: ebs_decay_cache[i] == EBS_DECAY_PER_TICK + * NB The bigger this array is the quicker decay calculations will be + */ +#define SCHED_DECAY_CACHE_SIZE 1024 +static uint32_t ebs_decay_cache[SCHED_DECAY_CACHE_SIZE] ____cacheline_maxaligned_in_smp; + +/* + * Make this usable for run time settable half life in a future enhancement + */ +static void init_decay_cache(unsigned long decay_per_tick) +{ + int i; -#define SCALE(v1,v1_max,v2_max) \ - (v1) * (v2_max) / (v1_max) + ebs_decay_cache[0] = decay_per_tick; + for (i = 1; i < SCHED_DECAY_CACHE_SIZE; i++) + ebs_decay_cache[i] = EBS_MUL(ebs_decay_cache[i - 1], decay_per_tick); +} -#define DELTA(p) \ - (SCALE(TASK_NICE(p), 40, MAX_USER_PRIO*PRIO_BONUS_RATIO/100) + \ - INTERACTIVE_DELTA) +static inline uint32_t ebs_decayed_value(uint32_t val, unsigned long n) +{ + /* + * Assert: n is always greater than zero + */ + if (n <= SCHED_DECAY_CACHE_SIZE) { + return EBS_MUL(val, ebs_decay_cache[n - 1]); + } else { + unsigned long a = n / SCHED_DECAY_CACHE_SIZE; + uint64_t tmp = ebs_decay_cache[SCHED_DECAY_CACHE_SIZE - 1]; + uint32_t res = (n % SCHED_DECAY_CACHE_SIZE) ? EBS_MUL(val, ebs_decay_cache[(n % SCHED_DECAY_CACHE_SIZE) - 1]) : val; /* cache[i] holds i + 1 ticks of decay; a zero remainder needs none */ + + while (1) { + if (a & 1) + res = EBS_MUL(res, tmp); + if ((0 == res) || !(a /= 2)) + break; + tmp = EBS_MUL(tmp, tmp); + } -#define TASK_INTERACTIVE(p) \ - ((p)->prio <= (p)->static_prio - DELTA(p)) + return res; + } +} -#define INTERACTIVE_SLEEP(p) \ - (JIFFIES_TO_NS(MAX_SLEEP_AVG * \ - (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) +/* + * A task's priority is a function of the ratio of its usage rate to its + * entitlement. 
Because its usage rate decays with time it will be entitled + * to promotion in the unlikely event (except on very busy systems) that it + * gets stuck on a run queue (without receiving any ticks) for long enough. + * The minimum interval for promotion to be deserved is a function of the half + * life and the number available SCHED_OTHER priority slots. A rounded value + * in ticks (for 40 SCHED_OTHER slots) is: + */ +#define EBS_PROMOTION_INTERVAL_COEFF ((uint32_t)0x9fbfc7) +static unsigned long ebs_promotion_interval = EBS_MUL(EBS_INIT_HALF_LIFE_TICKS, EBS_PROMOTION_INTERVAL_COEFF); +/* + * The number of such intervals a task must wait before promotion is a + * function of the task's actual priority. And is determined from the + * following table and associated function. The first entry is for + * EBS_MIN_PRIO, which is never promoted. + */ +#define SCHED_OTHER_SLOTS (MAX_PRIO - MAX_RT_PRIO) +static const int ebs_prom_table[SCHED_OTHER_SLOTS / 2] ____cacheline_maxaligned_in_smp = + { 1, 26, 13, 7, 5, 4, 3, 3, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1 }; +#define EBS_NUM_PROM_INTERVALS 27 + +/* + * A task that is exceeding its hard cap needs to be "sinbinned" for a while + */ +static const uint32_t ebs_sinbin_table_coeff[SCHED_OTHER_SLOTS / 2] ____cacheline_maxaligned_in_smp = { + (uint32_t)0x9fbfc7, + (uint32_t)0x148a1b3, + (uint32_t)0x1fbc16b, + (uint32_t)0x2ba7190, + (uint32_t)0x3864aec, + (uint32_t)0x4614147, + (uint32_t)0x54dc099, + (uint32_t)0x64ed6ef, + (uint32_t)0x7687262, + (uint32_t)0x89fffc7, + (uint32_t)0x9fbc16b, + (uint32_t)0xb864aec, + (uint32_t)0xd4dc099, + (uint32_t)0xf687262, + (uint32_t)0x11fbc168, + (uint32_t)0x154dc099, + (uint32_t)0x19fbc16b, + (uint32_t)0x21fbc16b, + (uint32_t)0x43f782d6, + (uint32_t)0x87ef05ac /* should never be used */ +}; -#define HIGH_CREDIT(p) \ - ((p)->interactive_credit > CREDIT_LIMIT) +static unsigned long ebs_sinbin_table[SCHED_OTHER_SLOTS / 2] ____cacheline_maxaligned_in_smp; -#define LOW_CREDIT(p) \ - 
((p)->interactive_credit < -CREDIT_LIMIT) +#define EBS_SINBIN_DURN(prio) (((prio) <= EBS_MEAN_PRI) ? 0 : \ + (ebs_sinbin_table[(prio) - (EBS_MEAN_PRI + 1)])) -#define TASK_PREEMPTS_CURR(p, rq) \ - ((p)->prio < (rq)->curr->prio) +/* + * Make this usable for run time settable half life in a future enhancement + */ +static void init_sinbin_table(unsigned long half_life_ticks) +{ + int i; + + for (i = 0; i < (SCHED_OTHER_SLOTS / 2); i++) + ebs_sinbin_table[i] = EBS_MUL(ebs_sinbin_table_coeff[i], half_life_ticks); +} + +#ifdef CONFIG_SCHED_DYNAMIC_HALF_LIFE /* - * BASE_TIMESLICE scales user-nice values [ -20 ... 19 ] - * to time slice values. - * - * The higher a thread's priority, the bigger timeslices - * it gets during one round of execution. But even the lowest - * priority thread gets MIN_TIMESLICE worth of execution time. - * - * task_timeslice() is the interface that is used by the scheduler. + * We need to ensure that half life changes are atomic */ +static spinlock_t ebs_half_life_lock = SPIN_LOCK_UNLOCKED; -#define BASE_TIMESLICE(p) (MIN_TIMESLICE + \ - ((MAX_TIMESLICE - MIN_TIMESLICE) * \ - (MAX_PRIO-1 - (p)->static_prio) / (MAX_USER_PRIO-1))) +/* + * Require: EBS_MIN_HALF_LIFE_MSECS <= half_life_msecs <= EBS_MAX_HALF_LIFE_MSECS + */ +int set_cpu_half_life_msecs(unsigned int half_life_msecs) +{ + unsigned long new_half_life_ticks; + unsigned long new_promotion_interval; + uint32_t new_decay_per_tick; + uint32_t new_incr_per_tick; + struct task_struct *tp; -static inline unsigned int task_timeslice(task_t *p) + if ((EBS_MAX_HALF_LIFE_MSECS < half_life_msecs) || (EBS_MIN_HALF_LIFE_MSECS > half_life_msecs)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + new_half_life_ticks = ((half_life_msecs * HZ) / 1000); + new_incr_per_tick = EBS_POLY(EBS_LOG_2 / new_half_life_ticks); + new_decay_per_tick = (EBS_ONE - new_incr_per_tick); + new_promotion_interval = EBS_MUL(new_half_life_ticks, EBS_PROMOTION_INTERVAL_COEFF); /* + * We've got a fair bit 
to do here which we'd like to be atomic so we'll + * need some locking + */ + spin_lock(&ebs_half_life_lock); + ebs_half_life_ticks = new_half_life_ticks; + ebs_incr_per_tick = new_incr_per_tick; + ebs_decay_per_tick = new_decay_per_tick; + ebs_promotion_interval = new_promotion_interval; + init_decay_cache(ebs_decay_per_tick); + init_sinbin_table(ebs_half_life_ticks); + read_lock(&tasklist_lock); /* the task list is only walked here, never modified, so a read lock suffices */ + for_each_process(tp) { + tp->cpu_incr_per_tick = EBS_INCR_PER_SHARE(tp->cpu_shares); + } + read_unlock(&tasklist_lock); + spin_unlock(&ebs_half_life_lock); + + return 0; +} + +/* + * Return the value of CPU half life in msecs + */ +unsigned int get_cpu_half_life_msecs(void) +{ + return (ebs_half_life_ticks * 1000) / HZ; +} + +EXPORT_SYMBOL(set_cpu_half_life_msecs); +EXPORT_SYMBOL(get_cpu_half_life_msecs); + +#ifdef CONFIG_PROC_FS +static int +read_cpu_half_life_fn(char *page, char **start, off_t off, int count, int *eof, void *data) +{ + int len; + + len = sprintf(page, "%u\n", get_cpu_half_life_msecs()); + + return len; +} + +static int +write_cpu_half_life_fn(struct file *file, const char __user *buffer, unsigned long count, void *data) +{ + char kbuf[32] = ""; /* zero-filled so that a shorter write stays NUL terminated */ + char *endptr = NULL; + unsigned long half_life_msecs; + int res; + + if (count >= sizeof(kbuf)) /* a completely full buffer would leave no NUL for simple_strtoul() */ + return -EFBIG; + if (copy_from_user(kbuf, buffer, count)) + return -EFAULT; + half_life_msecs = simple_strtoul(kbuf, &endptr, 0); + if ((endptr == kbuf) || (half_life_msecs == ULONG_MAX)) + return -EINVAL; + + res = set_cpu_half_life_msecs(half_life_msecs); + + return res ? 
res : count; +} + +static struct proc_dir_entry *cpu_half_life_file = NULL; + +void __init init_cpu_half_life_file(void) { - return BASE_TIMESLICE(p); + if (!(cpu_half_life_file = create_proc_entry("cpu_half_life", 0644, NULL))) + return; + cpu_half_life_file->read_proc = read_cpu_half_life_fn; + cpu_half_life_file->write_proc = write_cpu_half_life_fn; } +#endif +#endif /* * These are the runqueue data structures: @@ -184,12 +393,13 @@ typedef struct runqueue runqueue_t; -struct prio_array { - int nr_active; - unsigned long bitmap[BITMAP_SIZE]; - struct list_head queue[MAX_PRIO]; +struct prio_slot { + unsigned int prio; + struct list_head queue; }; +typedef struct prio_slot prio_slot_t; + /* * This is the main, per-CPU runqueue data structure. * @@ -199,12 +409,24 @@ */ struct runqueue { spinlock_t lock; - unsigned long nr_running, nr_switches, expired_timestamp, - nr_uninterruptible, timestamp_last_tick; + unsigned long nr_running; + unsigned long long nr_switches; + uint32_t eff_ent_per_share; task_t *curr, *idle; struct mm_struct *prev_mm; - prio_array_t *active, *expired, arrays[2]; - int best_expired_prio, prev_cpu_load[NR_CPUS]; + unsigned long bitmap[BITMAP_SIZE]; + prio_slot_t queues[MAX_PRIO + 1]; + prio_slot_t *current_prio_slot; + unsigned int next_prom_list; + struct list_head for_promotion[EBS_NUM_PROM_INTERVALS][SCHED_OTHER_SLOTS]; + unsigned long nr_uninterruptible; + unsigned long nr_sinbinned; + unsigned long long sinbinned_ticks; + /* + * Can't trust the timers enough to use jiffies % ebs_promotion_interval + */ + unsigned long next_prom_due; + int prev_cpu_load[NR_CPUS]; #ifdef CONFIG_NUMA atomic_t *node_nr_running; int prev_node_load[MAX_NUMNODES]; @@ -323,174 +545,226 @@ /* * Adding/removing a task to/from a priority array: */ -static inline void dequeue_task(struct task_struct *p, prio_array_t *array) +static inline void dequeue_task(struct task_struct *p) { - array->nr_active--; - list_del(&p->run_list); - if (list_empty(array->queue + 
p->prio)) - __clear_bit(p->prio, array->bitmap); + struct list_head *slotp = p->run_list.next; + + list_del_init(&p->prom_list); + list_del_init(&p->run_list); + if (list_empty(slotp)) + __clear_bit(list_entry(slotp, prio_slot_t, queue)->prio, task_rq(p)->bitmap); +} + +static inline void schedule_promotion(struct task_struct *p, runqueue_t *rq, int prio) +{ + if (likely((EBS_MIN_PRI < prio) && (EBS_BGND_PRI > prio))) { + int slot = prio - EBS_MIN_PRI; + int pinv = (EBS_MEAN_PRI >= prio) ? slot : EBS_BGND_PRI - prio; + + pinv = (rq->next_prom_list + ebs_prom_table[pinv]) % EBS_NUM_PROM_INTERVALS; + list_add_tail(&p->prom_list, &rq->for_promotion[pinv][slot]); + } } -static inline void enqueue_task(struct task_struct *p, prio_array_t *array) +static inline void enqueue_task(struct task_struct *p, runqueue_t *rq, int prio) { - list_add_tail(&p->run_list, array->queue + p->prio); - __set_bit(p->prio, array->bitmap); - array->nr_active++; - p->array = array; + list_add_tail(&p->run_list, &rq->queues[prio].queue); + __set_bit(prio, rq->bitmap); + schedule_promotion(p, rq, prio); } /* - * effective_prio - return the priority that is based on the static - * priority but is modified by bonuses/penalties. - * - * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] - * into the -5 ... 0 ... +5 bonus/penalty range. - * - * We use 25% of the full 0...39 priority range so that: - * - * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. - * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. - * - * Both properties are important to certain workloads. 
+ * Update the task's CPU usage rate data to the current time */ -static int effective_prio(task_t *p) +static inline void update_cpu_rate_stats(task_t *p) { - int bonus, prio; - - if (rt_task(p)) - return p->prio; + /* cope with jiffy wrap */ + unsigned long delta = jiffies_since(p->cpu_rate_timestamp); - bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; - - prio = p->static_prio - bonus; - if (prio < MAX_RT_PRIO) - prio = MAX_RT_PRIO; - if (prio > MAX_PRIO-1) - prio = MAX_PRIO-1; - return prio; + if (likely(delta)) { + p->cpu_rate_per_share = + ebs_decayed_value(p->cpu_rate_per_share, delta); + p->cpu_rate_timestamp += delta; + } } /* - * __activate_task - move a task to the runqueue. + * Effective priority for tasks in the real time classes */ -static inline void __activate_task(task_t *p, runqueue_t *rq) +static inline int rt_effective_prio(task_t *p) { - enqueue_task(p, rq->active); - nr_running_inc(rq); + return (MAX_RT_PRIO - 1) - p->rt_priority; } -static void recalc_task_prio(task_t *p, unsigned long long now) +/* + * Effective priority for tasks in the SCHED_NORMAL class wrt a particular CPU + * Assumes that the processes usages are up to date and only uses 32 bit arithmetic + */ +static inline int sched_normal_effective_prio(task_t *p, runqueue_t *rq) { - unsigned long long __sleep_time = now - p->timestamp; - unsigned long sleep_time; + uint32_t eps; - if (__sleep_time > NS_MAX_SLEEP_AVG) - sleep_time = NS_MAX_SLEEP_AVG; + if (unlikely(0 == p->cpu_rate_cap_per_share)) + return EBS_BGND_PRI; + if (unlikely(p->cpu_rate_cap_per_share < rq->eff_ent_per_share)) + eps = p->cpu_rate_cap_per_share; else - sleep_time = (unsigned long)__sleep_time; + eps = rq->eff_ent_per_share; + if (p->cpu_rate_per_share == eps) + return EBS_MEAN_PRI; + /* At least one of eps or p->cpu_rate_per_share is greater than zero */ + if (likely(p->cpu_rate_per_share < eps)) + return EBS_MIN_PRI + EBS_MAP_SCALE(p->cpu_rate_per_share, eps); + return EBS_MAX_PRI - EBS_MAP_SCALE(eps, 
p->cpu_rate_per_share); +} - if (likely(sleep_time > 0)) { - /* - * User tasks that sleep a long time are categorised as - * idle and will get just interactive status to stay active & - * prevent them suddenly becoming cpu hogs and starving - * other processes. - */ - if (p->mm && p->activated != -1 && - sleep_time > INTERACTIVE_SLEEP(p)) { - p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG - - AVG_TIMESLICE); - if (!HIGH_CREDIT(p)) - p->interactive_credit++; - } else { - /* - * The lower the sleep avg a task has the more - * rapidly it will rise with sleep time. - */ - sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1; +static inline int effective_prio(task_t *p) +{ + if (rt_task(p)) + return rt_effective_prio(p); + + return sched_normal_effective_prio(p, task_rq(p)); +} + +/* + * Assume runqueue lock is already held. + */ +static void do_promotions(runqueue_t *rq) +{ + struct list_head *tmp, *head, *tail, slice; + int new_prio, pinv, slot, idx = EBS_MIN_PRI; + + spin_lock(&rq->lock); + rq->next_prom_list = (rq->next_prom_list + 1) % EBS_NUM_PROM_INTERVALS; + for (;;) { + idx = find_next_bit(rq->bitmap, MAX_PRIO, idx + 1); + if (EBS_BGND_PRI <= idx) + break; + + tmp = rq->for_promotion[rq->next_prom_list] + (idx - EBS_MIN_PRI); + if (!list_empty(tmp)) { + new_prio = idx - 1; + head = &list_entry(tmp->next, task_t, prom_list)->run_list; + tail = &list_entry(tmp->prev, task_t, prom_list)->run_list; /* - * Tasks with low interactive_credit are limited to - * one timeslice worth of sleep avg bonus. + * Anything on the promotion list must be on + * the runqueue such that head or tail cannot + * be empty. 
*/ - if (LOW_CREDIT(p) && - sleep_time > JIFFIES_TO_NS(task_timeslice(p))) - sleep_time = JIFFIES_TO_NS(task_timeslice(p)); + __list_extract_slice(&slice, head, tail); + __list_splice(&slice, rq->queues[new_prio].queue.prev); + if (list_empty(&rq->queues[idx].queue)) + __clear_bit(idx, rq->bitmap); + __set_bit(new_prio, rq->bitmap); /* - * Non high_credit tasks waking from uninterruptible - * sleep are limited in their sleep_avg rise as they - * are likely to be cpu hogs waiting on I/O + * If promotion occurs from the slot + * associated with rq->current_prio_slot then the + * current task will be one of those promoted + * so we should update rq->current_prio_slot */ - if (p->activated == -1 && !HIGH_CREDIT(p) && p->mm) { - if (p->sleep_avg >= INTERACTIVE_SLEEP(p)) - sleep_time = 0; - else if (p->sleep_avg + sleep_time >= - INTERACTIVE_SLEEP(p)) { - p->sleep_avg = INTERACTIVE_SLEEP(p); - sleep_time = 0; - } - } + if (idx == rq->current_prio_slot->prio) + rq->current_prio_slot = rq->queues + new_prio; /* - * This code gives a bonus to interactive tasks. - * - * The boost works by updating the 'average sleep time' - * value here, based on ->timestamp. The more time a - * task spends sleeping, the higher the average gets - - * and the higher the priority boost gets as well. + * A priority of EBS_MIN_PRIO cannot be + * promoted any further, but we have to move + * them somewhere or their prom_list would be + * invalid. We do not remove them from the + * list as that would be an O(n) operation. + * This will happen eventually. */ - p->sleep_avg += sleep_time; - - if (p->sleep_avg > NS_MAX_SLEEP_AVG) { - p->sleep_avg = NS_MAX_SLEEP_AVG; - if (!HIGH_CREDIT(p)) - p->interactive_credit++; - } + slot = new_prio - EBS_MIN_PRI; + pinv = (EBS_MEAN_PRI >= new_prio) ? 
slot : + EBS_BGND_PRI - new_prio; + pinv = (rq->next_prom_list + ebs_prom_table[pinv]) % + EBS_NUM_PROM_INTERVALS; + __list_splice(tmp, rq->for_promotion[pinv][slot].prev); + INIT_LIST_HEAD(tmp); } } + rq->next_prom_due += ebs_promotion_interval; + spin_unlock(&rq->lock); +} + +void ebs_sinbin_fn(unsigned long arg) +{ + unsigned long flags; + struct task_struct *p = (struct task_struct*)arg; + unsigned long delta; + runqueue_t *rq = task_rq_lock(p, &flags); + +/* + * Don't bother updating usage and recalculating priority completely as + * usage should have decayed to entitlement so adjust prio accordingly + * Any error due to this short cut will be self correcting + */ + p->cpu_ebs_flags &= ~EBS_CPU_RATE_CAP_SINBIN; + delta = jiffies_since(p->sinbin_timestamp); +#ifdef CONFIG_SMP + p->per_cpu_sinbin_ticks[task_cpu(p)] += delta; +#else + p->sinbin_ticks += delta; +#endif + enqueue_task(p, rq, EBS_MEAN_PRI); + nr_running_inc(rq); + rq->nr_sinbinned--; + task_rq_unlock(rq, &flags); +} - p->prio = effective_prio(p); +/* + * __activate_task - move a task to the runqueue. + */ +static inline void __activate_task(task_t *p, runqueue_t *rq, int prio) +{ + enqueue_task(p, rq, prio); + nr_running_inc(rq); +#ifdef CONFIG_SCHED_STATS + p->runnable_timestamp = jiffies; +#endif } /* * activate_task - move a task to the runqueue and do priority recalculation * - * Update all the scheduling statistics stuff. (sleep average - * calculation, priority modifiers, etc.) + * Update all the scheduling statistics stuff. */ -static inline void activate_task(task_t *p, runqueue_t *rq) +static inline int activate_task(task_t *p, runqueue_t *rq) { - unsigned long long now = sched_clock(); - - recalc_task_prio(p, now); + int prio; - /* - * This checks to make sure it's not an uninterruptible task - * that is now waking up. - */ - if (!p->activated) { + if (likely(!rt_task(p))) { /* - * Tasks which were woken up by interrupts (ie. hw events) - * are most likely of interactive nature. 
So we give them - * the credit of extending their sleep time to the period - * of time they spend on the runqueue, waiting for execution - * on a CPU, first time around: + * This task probably hasn't been on the CPU for a while + * so update its usage rate to get a more relevant priority */ - if (in_interrupt()) - p->activated = 2; - else { - /* - * Normal first-time wakeups get a credit too for - * on-runqueue time, but it will be weighted down: - */ - p->activated = 1; - } - } - p->timestamp = now; + update_cpu_rate_stats(p); + p->time_slice = task_timeslice(p); + prio = sched_normal_effective_prio(p, rq); + } else + prio = rt_effective_prio(p); + __activate_task(p, rq, prio); + return prio; +} - __activate_task(p, rq); +#ifdef CONFIG_SCHED_STATS +/* + * Update statistics on task deactivation. + */ +static inline void update_cpu_stats_on_deactivation(task_t *p) +{ + unsigned long delta_runnable = jiffies_since(p->runnable_timestamp); + +#ifdef CONFIG_SMP + p->per_cpu_runnable[task_cpu(p)] += delta_runnable; + p->per_cpu_slices[task_cpu(p)]++; +#else + p->runnable += delta_runnable; + p->slices++; +#endif } +#endif /* * deactivate_task - remove a task from the runqueue. @@ -500,8 +774,10 @@ nr_running_dec(rq); if (p->state == TASK_UNINTERRUPTIBLE) rq->nr_uninterruptible++; - dequeue_task(p, p->array); - p->array = NULL; + dequeue_task(p); +#ifdef CONFIG_SCHED_STATS + update_cpu_stats_on_deactivation(p); +#endif } /* @@ -567,7 +843,7 @@ * If the task is not on a runqueue (and not running), then * it is sufficient to simply update the task's cpu field. */ - if (!p->array && !task_running(rq, p)) { + if (list_empty(&p->run_list) && !task_running(rq, p)) { set_task_cpu(p, any_online_cpu(p->cpus_allowed)); return 0; } @@ -662,7 +938,7 @@ rq = task_rq_lock(p, &flags); old_state = p->state; if (old_state & state) { - if (!p->array) { + if (list_empty(&p->run_list)) { /* * Fast-migrate the task if it's not running or runnable * currently. Do not violate hard affinity. 
@@ -676,19 +952,13 @@ task_rq_unlock(rq, &flags); goto repeat_lock_task; } - if (old_state == TASK_UNINTERRUPTIBLE) { + if (old_state == TASK_UNINTERRUPTIBLE) rq->nr_uninterruptible--; - /* - * Tasks on involuntary sleep don't earn - * sleep_avg beyond just interactive state. - */ - p->activated = -1; - } - if (sync && (task_cpu(p) == smp_processor_id())) - __activate_task(p, rq); - else { - activate_task(p, rq); - if (TASK_PREEMPTS_CURR(p, rq)) + if (sync && (task_cpu(p) == smp_processor_id())) { + p->time_slice = task_timeslice(p); + __activate_task(p, rq, effective_prio(p)); + } else { + if (TASK_PREEMPTS_CURR(activate_task(p, rq), rq)) resched_task(rq->curr); } success = 1; @@ -716,7 +986,7 @@ * Perform scheduler related setup for a newly forked process p. * p is forked by current. */ -void sched_fork(task_t *p) +void sched_fork(task_t *p, unsigned long clone_flags) { /* * We mark the process as running here, but have not actually @@ -724,10 +994,32 @@ * nobody will actually run it, and a signal or other external * event cannot wake it up and insert it on the runqueue either. 
*/ + init_timer(&p->sinbin_timer); + p->sinbin_timer.data = (unsigned long) p; p->state = TASK_RUNNING; INIT_LIST_HEAD(&p->run_list); - p->array = NULL; + INIT_LIST_HEAD(&p->prom_list); spin_lock_init(&p->switch_lock); +#ifdef CONFIG_SMP + { + int i; + + for (i = 0; i < NR_CPUS; i++) { + p->per_cpu_sinbin_ticks[i] = 0; +#ifdef CONFIG_SCHED_STATS + p->per_cpu_utime[i] = 0; + p->per_cpu_stime[i] = 0; + p->per_cpu_runnable[i] = 0; + p->per_cpu_slices[i] = 0; +#endif + } + } +#else + p->sinbin_ticks = 0; +#ifdef CONFIG_SCHED_STATS + p->slices = p->runnable = 0; +#endif +#endif #ifdef CONFIG_PREEMPT /* * During context-switch we hold precisely one spinlock, which @@ -738,32 +1030,18 @@ p->thread_info->preempt_count = 1; #endif /* - * Share the timeslice between parent and child, thus the - * total amount of pending timeslices in the system doesn't change, - * resulting in more scheduling fairness. + * To mollify ramp up effect to some extent (in particular, the swamping + * of a parent by its children) we'll leave the child with the same CPU + * usage rate and timestamp as its parent UNLESS a reset is specifically + * requested (e.g. for kernel threads). This should not adversely + * affect interactive tasks as they are generally launched by other + * interactive tasks with inherently low usage. */ - local_irq_disable(); - p->time_slice = (current->time_slice + 1) >> 1; + if (CLONE_RESET_CPU_USAGE & clone_flags) + p->cpu_rate_per_share = 0; /* - * The remainder of the first timeslice might be recovered by - * the parent if the child exits early enough. + * New tasks will be given a new timeslice when they become active */ - p->first_time_slice = 1; - current->time_slice >>= 1; - p->timestamp = sched_clock(); - if (!current->time_slice) { - /* - * This case is rare, it happens when the parent has only - * a single jiffy left from its timeslice. Taking the - * runqueue lock is not a problem. 
- */ - current->time_slice = 1; - preempt_disable(); - scheduler_tick(0, 0); - local_irq_enable(); - preempt_enable(); - } else - local_irq_enable(); } /* @@ -780,63 +1058,48 @@ BUG_ON(p->state != TASK_RUNNING); /* - * We decrease the sleep average of forking parents - * and children as well, to keep max-interactive tasks - * from forking tasks that are max-interactive. + * Give new tasks a complete time slice + */ + p->time_slice = task_timeslice(p); + /* + * The child has had no ticks yet + */ + p->cpu_ebs_flags &= ~EBS_NEEDS_PRIO_RECALC; + /* + * To mollify ramp up effect to some extent (in particular, the swamping + * of a parent by its children) we'll leave the child with the same CPU + * usage rate and timestamp as its parent. This should not adversely + * affect interactive tasks as they are generally launched by other + * interactive tasks with inherently low usage. */ - current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * - PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); - - p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * - CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); - - p->interactive_credit = 0; - - p->prio = effective_prio(p); set_task_cpu(p, smp_processor_id()); - if (unlikely(!current->array)) - __activate_task(p, rq); + /* + * Now that the idle task is back on the run queue we need extra care + * to make sure that its one and only fork() doesn't end up in the idle + * priority slot. Just testing for empty run list is no longer adequate. 
+ */ + if (unlikely(list_empty(&current->run_list) || (MAX_PRIO == rq->current_prio_slot->prio))) { + if (unlikely(rt_task(p))) + __activate_task(p, rq, rt_effective_prio(p)); + else + __activate_task(p, rq, EBS_MIN_PRI); + } else { - p->prio = current->prio; + /* + * Put the child on the same list as (but ahead of) the parent + */ list_add_tail(&p->run_list, &current->run_list); - p->array = current->array; - p->array->nr_active++; + if (likely(!list_empty(&current->prom_list))) + list_add_tail(&p->prom_list, &current->prom_list); nr_running_inc(rq); +#ifdef CONFIG_SCHED_STATS + p->runnable_timestamp = jiffies; +#endif } task_rq_unlock(rq, &flags); } -/* - * Potentially available exiting-child timeslices are - * retrieved here - this way the parent does not get - * penalized for creating too many threads. - * - * (this cannot be used to 'generate' timeslices - * artificially, because any timeslice recovered here - * was given away by the parent in the first place.) - */ -void sched_exit(task_t * p) -{ - unsigned long flags; - - local_irq_save(flags); - if (p->first_time_slice) { - p->parent->time_slice += p->time_slice; - if (unlikely(p->parent->time_slice > MAX_TIMESLICE)) - p->parent->time_slice = MAX_TIMESLICE; - } - local_irq_restore(flags); - /* - * If the child was a (relative-) CPU hog then decrease - * the sleep_avg of the parent as well. - */ - if (p->sleep_avg < p->parent->sleep_avg) - p->parent->sleep_avg = p->parent->sleep_avg / - (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg / - (EXIT_WEIGHT + 1); -} - /** * finish_task_switch - clean up after a task-switch * @prev: the thread we just switched away from. @@ -1238,26 +1501,50 @@ return busiest; } +#ifdef CONFIG_SCHED_STATS +/* + * Update statistics on task migration. 
+ */ +static inline void update_cpu_stats_on_migration(task_t *p) +{ + unsigned long delta_runnable = jiffies_since(p->runnable_timestamp); + + p->runnable_timestamp += delta_runnable; + p->per_cpu_runnable[task_cpu(p)] += delta_runnable; +} +#endif + /* * pull_task - move a task from a remote runqueue to the local runqueue. * Both runqueues must be locked. */ static inline -void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, - runqueue_t *this_rq, int this_cpu) +void pull_task(runqueue_t *src_rq, task_t *p, runqueue_t *this_rq, int this_cpu) { - dequeue_task(p, src_array); + int prio; + + dequeue_task(p); nr_running_dec(src_rq); +#ifdef CONFIG_SCHED_STATS + update_cpu_stats_on_migration(p); +#endif set_task_cpu(p, this_cpu); nr_running_inc(this_rq); - enqueue_task(p, this_rq->active); - p->timestamp = sched_clock() - - (src_rq->timestamp_last_tick - p->timestamp); + if (unlikely(rt_task(p))) + prio = rt_effective_prio(p); + else { + /* + * task priority depends on the CPU as well as the task itself + */ + update_cpu_rate_stats(p); + prio = sched_normal_effective_prio(p, this_rq); + } + enqueue_task(p, this_rq, prio); /* * Note that idle threads have a prio of MAX_PRIO, for this test * to be always true for them. 
*/ - if (TASK_PREEMPTS_CURR(p, this_rq)) + if (TASK_PREEMPTS_CURR(prio, this_rq)) set_need_resched(); } @@ -1267,8 +1554,6 @@ static inline int can_migrate_task(task_t *tsk, runqueue_t *rq, int this_cpu, int idle) { - unsigned long delta = rq->timestamp_last_tick - tsk->timestamp; - /* * We do not migrate tasks that are: * 1) running (obviously), or @@ -1279,7 +1564,7 @@ return 0; if (!cpu_isset(this_cpu, tsk->cpus_allowed)) return 0; - if (!idle && (delta <= JIFFIES_TO_NS(cache_decay_ticks))) + if (!idle && (jiffies_since(tsk->mem_cache_timestamp) <= cache_decay_ticks)) return 0; return 1; } @@ -1296,7 +1581,6 @@ { int imbalance, idx, this_cpu = smp_processor_id(); runqueue_t *busiest; - prio_array_t *array; struct list_head *head, *curr; task_t *tmp; @@ -1311,34 +1595,17 @@ */ imbalance /= 2; - /* - * We first consider expired tasks. Those will likely not be - * executed in the near future, and they are most likely to - * be cache-cold, thus switching CPUs has the least effect - * on them. 
- */ - if (busiest->expired->nr_active) - array = busiest->expired; - else - array = busiest->active; - -new_array: /* Start searching at priority 0: */ idx = 0; skip_bitmap: if (!idx) - idx = sched_find_first_bit(array->bitmap); + idx = sched_find_first_bit(busiest->bitmap); else - idx = find_next_bit(array->bitmap, MAX_PRIO, idx); - if (idx >= MAX_PRIO) { - if (array == busiest->expired) { - array = busiest->active; - goto new_array; - } + idx = find_next_bit(busiest->bitmap, MAX_PRIO, idx); + if (idx >= MAX_PRIO) goto out_unlock; - } - head = array->queue + idx; + head = &busiest->queues[idx].queue; curr = head->prev; skip_queue: tmp = list_entry(curr, task_t, run_list); @@ -1351,7 +1618,7 @@ idx++; goto skip_bitmap; } - pull_task(busiest, array, tmp, this_rq, this_cpu); + pull_task(busiest, tmp, this_rq, this_cpu); /* Only migrate one task if we are idle */ if (!idle && --imbalance) { @@ -1448,22 +1715,6 @@ EXPORT_PER_CPU_SYMBOL(kstat); /* - * We place interactive tasks back into the active array, if possible. - * - * To guarantee that this does not starve expired tasks we ignore the - * interactivity of a task if the first expired task had to wait more - * than a 'reasonable' amount of time. This deadline timeout is - * load-dependent, as the frequency of array switched decreases with - * increasing number of running tasks. We also ignore the interactivity - * if a better static_prio task has expired: - */ -#define EXPIRED_STARVING(rq) \ - ((STARVATION_LIMIT && ((rq)->expired_timestamp && \ - (jiffies - (rq)->expired_timestamp >= \ - STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \ - ((rq)->curr->static_prio > (rq)->best_expired_prio)) - -/* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. 
* @@ -1477,8 +1728,6 @@ runqueue_t *rq = this_rq(); task_t *p = current; - rq->timestamp_last_tick = sched_clock(); - if (rcu_pending(cpu)) rcu_check_callbacks(cpu, user_ticks); @@ -1492,6 +1741,21 @@ } if (p == rq->idle) { + /* + * Decay the effective entitlement per share for this CPU + * Is this lock necessary? + */ + spin_lock(&rq->lock); + rq->eff_ent_per_share = EBS_DECAYED_FOR_TICK(rq->eff_ent_per_share); + /* + * There should be no tasks to promote so just update where we're + * up to + */ + if (unlikely(time_after_eq(jiffies, rq->next_prom_due))) { + rq->next_prom_list = (rq->next_prom_list + 1) % EBS_NUM_PROM_INTERVALS; + rq->next_prom_due += ebs_promotion_interval; + } + spin_unlock(&rq->lock); if (atomic_read(&rq->nr_iowait) > 0) cpustat->iowait += sys_ticks; else @@ -1505,13 +1769,15 @@ cpustat->user += user_ticks; cpustat->system += sys_ticks; - /* Task might have expired already, but not scheduled off yet */ - if (p->array != rq->active) { - set_tsk_need_resched(p); - goto out; - } +#ifdef CONFIG_SCHED_STATS + cpustat->runnable += rq->nr_running; +#endif spin_lock(&rq->lock); /* + * Decay the effective entitlement per share for this CPU + */ + rq->eff_ent_per_share = EBS_DECAYED_FOR_TICK(rq->eff_ent_per_share); + /* * The task was running during this tick - update the * time slice counter. 
Note: we do not update a thread's * priority until it either goes to sleep or uses up its @@ -1525,62 +1791,36 @@ */ if ((p->policy == SCHED_RR) && !--p->time_slice) { p->time_slice = task_timeslice(p); - p->first_time_slice = 0; set_tsk_need_resched(p); /* put it at the end of the queue: */ - dequeue_task(p, rq->active); - enqueue_task(p, rq->active); + dequeue_task(p); + enqueue_task(p, rq, rq->current_prio_slot->prio); } goto out_unlock; } - if (!--p->time_slice) { - dequeue_task(p, rq->active); - set_tsk_need_resched(p); - p->prio = effective_prio(p); - p->time_slice = task_timeslice(p); - p->first_time_slice = 0; - - if (!rq->expired_timestamp) - rq->expired_timestamp = jiffies; - if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { - enqueue_task(p, rq->expired); - if (p->static_prio < rq->best_expired_prio) - rq->best_expired_prio = p->static_prio; - } else - enqueue_task(p, rq->active); + if (EBS_NEEDS_PRIO_RECALC & p->cpu_ebs_flags) { + p->cpu_rate_timestamp = jiffies; + p->cpu_rate_per_share = EBS_DECAYED_FOR_TICK(p->cpu_rate_per_share); } else { /* - * Prevent a too long timeslice allowing a task to monopolize - * the CPU. We do this by splitting up the timeslice into - * smaller pieces. - * - * Note: this does not mean the task's timeslices expire or - * get lost in any way, they just might be preempted by - * another task of equal priority. (one with higher - * priority would have preempted this task already.) We - * requeue this task to the end of the list on this priority - * level, which is in essence a round-robin of tasks with - * equal priority. - * - * This only applies to tasks in the interactive - * delta range with at least TIMESLICE_GRANULARITY to requeue. + * This is the first tick this time slice so update CPU usage + * rate stats to allow simplified per tick update on future + * ticks. 
*/ - if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - - p->time_slice) % TIMESLICE_GRANULARITY(p)) && - (p->time_slice >= TIMESLICE_GRANULARITY(p)) && - (p->array == rq->active)) { - - dequeue_task(p, rq->active); - set_tsk_need_resched(p); - p->prio = effective_prio(p); - enqueue_task(p, rq->active); - } + update_cpu_rate_stats(p); + p->cpu_ebs_flags |= EBS_NEEDS_PRIO_RECALC; } + p->cpu_rate_per_share += p->cpu_incr_per_tick; + if (likely(p->cpu_rate_per_share <= p->cpu_rate_cap_per_share) && (p->cpu_rate_per_share > rq->eff_ent_per_share)) + rq->eff_ent_per_share = p->cpu_rate_per_share; + if (!--p->time_slice) + set_tsk_need_resched(p); out_unlock: spin_unlock(&rq->lock); -out: rebalance_tick(rq, 0); + if (unlikely(time_after_eq(jiffies, rq->next_prom_due))) + do_promotions(rq); } void scheduling_functions_start_here(void) { } @@ -1593,11 +1833,7 @@ long *switch_count; task_t *prev, *next; runqueue_t *rq; - prio_array_t *array; - struct list_head *queue; - unsigned long long now; - unsigned long run_time; - int idx; + unsigned long now = jiffies; /* * Test if we are atomic. Since do_exit() needs to call into @@ -1617,19 +1853,6 @@ rq = this_rq(); release_kernel_lock(prev); - now = sched_clock(); - if (likely(now - prev->timestamp < NS_MAX_SLEEP_AVG)) - run_time = now - prev->timestamp; - else - run_time = NS_MAX_SLEEP_AVG; - - /* - * Tasks with interactive credits get charged less run_time - * at high sleep_avg to delay them losing their interactive - * status - */ - if (HIGH_CREDIT(prev)) - run_time /= (CURRENT_BONUS(prev) ? : 1); spin_lock_irq(&rq->lock); @@ -1638,69 +1861,65 @@ * to picking the next task. 
*/ switch_count = &prev->nivcsw; - if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { - switch_count = &prev->nvcsw; - if (unlikely((prev->state & TASK_INTERRUPTIBLE) && - unlikely(signal_pending(prev)))) - prev->state = TASK_RUNNING; - else - deactivate_task(prev, rq); - } - - if (unlikely(!rq->nr_running)) { -#ifdef CONFIG_SMP - load_balance(rq, 1, cpu_to_node_mask(smp_processor_id())); -#endif - if (!rq->nr_running) { - next = rq->idle; - rq->expired_timestamp = 0; - goto switch_tasks; + if (likely(prev->state)) { + if (preempt_count() & PREEMPT_ACTIVE) { + prev->cpu_ebs_flags &= ~EBS_NEEDS_PRIO_RECALC; + } else { + switch_count = &prev->nvcsw; + if (unlikely((prev->state & TASK_INTERRUPTIBLE) && + unlikely(signal_pending(prev)))) + prev->state = TASK_RUNNING; + else { + /* + * Priority will be calculated when the task + * is reactivated + */ + prev->cpu_ebs_flags &= ~EBS_NEEDS_PRIO_RECALC; + deactivate_task(prev, rq); + } } } - array = rq->active; - if (unlikely(!array->nr_active)) { + /* + * This test will always fail for idle and real time tasks + */ + if (unlikely(EBS_NEEDS_PRIO_RECALC & prev->cpu_ebs_flags)) { + prev->cpu_ebs_flags &= ~EBS_NEEDS_PRIO_RECALC; + dequeue_task(prev); + prev->time_slice = task_timeslice(prev); /* - * Switch the active and expired arrays. 
+ * no need to update usage rate stats as we've just come off a CPU */ - rq->active = rq->expired; - rq->expired = array; - array = rq->active; - rq->expired_timestamp = 0; - rq->best_expired_prio = MAX_PRIO; - } - - idx = sched_find_first_bit(array->bitmap); - queue = array->queue + idx; - next = list_entry(queue->next, task_t, run_list); - - if (next->activated > 0) { - unsigned long long delta = now - next->timestamp; - - if (next->activated == 1) - delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; - - array = next->array; - dequeue_task(next, array); - recalc_task_prio(next, next->timestamp + delta); - enqueue_task(next, array); + rq->current_prio_slot = rq->queues + sched_normal_effective_prio(prev, rq); + if (unlikely(EBS_CPU_RATE_CAP_IS_HARD & prev->cpu_ebs_flags)) { + unsigned int sbt = EBS_SINBIN_DURN(rq->current_prio_slot->prio); + + if (sbt) { + nr_running_dec(rq); + prev->cpu_ebs_flags |= EBS_CPU_RATE_CAP_SINBIN; + prev->sinbin_timestamp = now; + rq->nr_sinbinned++; + prev->sinbin_timer.expires = now + sbt; + add_timer(&prev->sinbin_timer); + } + else + enqueue_task(prev, rq, rq->current_prio_slot->prio); + } else + enqueue_task(prev, rq, rq->current_prio_slot->prio); } - next->activated = 0; -switch_tasks: + +#ifdef CONFIG_SMP + if (unlikely(!rq->nr_running)) + load_balance(rq, 1, cpu_to_node_mask(smp_processor_id())); + prev->mem_cache_timestamp = now; +#endif + rq->current_prio_slot = rq->queues + sched_find_first_bit(rq->bitmap); + next = list_entry(rq->current_prio_slot->queue.next, task_t, run_list); prefetch(next); clear_tsk_need_resched(prev); RCU_qsctr(task_cpu(prev))++; - prev->sleep_avg -= run_time; - if ((long)prev->sleep_avg <= 0) { - prev->sleep_avg = 0; - if (!(HIGH_CREDIT(prev) || LOW_CREDIT(prev))) - prev->interactive_credit--; - } - prev->timestamp = now; - if (likely(prev != next)) { - next->timestamp = now; rq->nr_switches++; rq->curr = next; ++*switch_count; @@ -1961,12 +2180,234 @@ void scheduling_functions_end_here(void) { } 
+/* + * A little bit of long division + */ +#define EBS_UINT32_MAX ((uint32_t)0xffffffff) +#define EBS_UINT32_HIBIT ((uint32_t)0x80000000) +static inline uint32_t cap_fm_fraction(uint32_t x, uint32_t y) +{ + uint32_t res = 0; + int k = 0; + + while (x && (k < EBS_OFFSET)) { + uint32_t term; + int j; + + for (; (k < EBS_OFFSET) && !(x & EBS_UINT32_HIBIT); k++) + x <<= 1; + term = x / y; + for (j = 0; j < (EBS_OFFSET - k); j++) + term <<= 1; + res += term; + x %= y; + } + + return res; +} + +/* + * Require: (0x7fffffff >= den > 0) and (enu <= den and ((den != 0) or (not hard)) + */ +int set_cpu_rate_cap_fm_frac(struct task_struct *p, uint32_t enu, uint32_t den, int hard) +{ + /* + * Division by zero or too big a denominator will break the long division routine + * The fraction must represent a real number in the range 0 to 1 + */ + if (!den || (den > 0x7fffffff) || (den < enu)) + return -EDOM; + + return set_cpu_rate_cap(p, cap_fm_fraction(enu, den), hard); +} + +/* + * Require: 0 <= cap <= EBS_ONE and ((cap != 0) or (not hard)) + */ +int set_cpu_rate_cap(struct task_struct *p, uint32_t new_cap, int hard) +{ + int is_allowed; + unsigned long flags; + int requeue_required; + runqueue_t *rq; + int delta; + + if ((new_cap > EBS_ONE) || (hard && !new_cap)) /* zero hard caps are not allowed */ + return -EINVAL; + is_allowed = capable(CAP_SYS_NICE); + /* + * We have to be careful, if called from /proc code, + * the task might be in the middle of scheduling on another CPU. 
+ */ + rq = task_rq_lock(p, &flags); + if (!is_allowed) { + /* + * Ordinary users can set/change caps on their own tasks provided + * that the new setting is MORE constraining + */ + if (((current->euid != p->uid) && (current->uid != p->uid)) || + (new_cap > p->cpu_rate_cap) || + (!hard && (EBS_CPU_RATE_CAP_IS_HARD & p->cpu_ebs_flags))) + { + task_rq_unlock(rq, &flags); /* pairs with task_rq_lock() */ + return -EPERM; + } + } + /* + * The RT tasks don't have caps, but we still allow the caps to be + * set - but as expected it won't have any effect on scheduling until the + * task becomes SCHED_NORMAL: + */ + requeue_required = (!rt_task(p)) && (!list_empty(&p->run_list)); + if (requeue_required) + dequeue_task(p); + delta = new_cap - p->cpu_rate_cap; + p->cpu_rate_cap = new_cap; + p->cpu_rate_cap_per_share = (p->cpu_rate_cap / p->cpu_shares); + if (hard) + p->cpu_ebs_flags |= EBS_CPU_RATE_CAP_IS_HARD; + else + p->cpu_ebs_flags &= ~EBS_CPU_RATE_CAP_IS_HARD; + if (requeue_required) { + /* we won't be in here if it's a real time task */ + int new_prio = sched_normal_effective_prio(p, rq); + + enqueue_task(p, rq, new_prio); + if (task_running(rq, p)) + rq->current_prio_slot = rq->queues + new_prio; + /* + * If the task increased its cap or is running and + * lowered its cap, then reschedule its CPU: + */ + if ((delta > 0) || ((delta < 0) && task_running(rq, p))) + resched_task(rq->curr); + } + task_rq_unlock(rq, &flags); + return 0; +} + +EXPORT_SYMBOL(set_cpu_rate_cap); + +static inline int shares_to_nice(unsigned int shares) +{ +#define SQRT_LOOP(valid) \ + do { \ + uint32_t temp; \ + \ + if (rmdr >= (temp = (((res << 1) + b) << bshft--))) { \ + res += b; \ + rmdr -= temp; \ + } \ + \ + b >>= 1; \ + } while (valid) + + uint32_t res = 0; + uint32_t b = 0x00008000; + int bshft = 15; + uint32_t rmdr; + + if (shares <= 20) + return (20 - shares); + + rmdr = ((shares - 20) << 16); + SQRT_LOOP(((rmdr > 0xffff) && b)); + SQRT_LOOP(rmdr && (bshft > 7)); + /* + * It should be safe to multiply by the square root of the
denominator now + */ + + res <<= 8; + /* + * If the remainder is zero there's no sense going on + */ + if (!rmdr) + return ((res + 0x00008000) >> 16); /* same scaling as the final return */ + + rmdr <<= 16; + b <<= 8; + bshft += 8; + + SQRT_LOOP(rmdr && b); + + return ((res + 0x00008000) >> 16); +} + +/* + * Require: 1 <= shares <= EBS_MAX_SHARES + */ +int set_cpu_shares(struct task_struct *p, unsigned int shares) +{ + int is_allowed; + unsigned long flags; + int requeue_required; + runqueue_t *rq; + int delta; + + if ((EBS_MAX_SHARES < shares) || (1 > shares)) + return -EINVAL; + is_allowed = capable(CAP_SYS_NICE); + /* + * We have to be careful, if called from /proc code, + * the task might be in the middle of scheduling on another CPU. + */ + rq = task_rq_lock(p, &flags); + if (!is_allowed) { + /* + * Ordinary users can set/change shares on their own tasks provided + * that the new setting is less than their current shares + */ + if (((current->euid != p->uid) && (current->uid != p->uid)) || (shares > p->cpu_shares)) + { + task_rq_unlock(rq, &flags); + return -EPERM; + } + } + /* + * The RT tasks don't have shares, but we still allow the shares to be + * set - but as expected it won't have any effect on scheduling until the + * task becomes SCHED_NORMAL: + */ + requeue_required = (!rt_task(p)) && (!list_empty(&p->run_list)); + if (requeue_required) + dequeue_task(p); + delta = shares - p->cpu_shares; + p->cpu_rate_per_share *= p->cpu_shares; + p->cpu_shares = shares; + p->cpu_rate_per_share /= p->cpu_shares; + p->cpu_incr_per_tick = EBS_INCR_PER_SHARE(p->cpu_shares); + p->cpu_rate_cap_per_share = (p->cpu_rate_cap / p->cpu_shares); + /* + * Set nice to the nearest value so that reported nice reflects shares + * to some degree + */ + p->nice = shares_to_nice(shares); + if (requeue_required) { + /* we won't be in here if it's a real time task */ + int new_prio = sched_normal_effective_prio(p, rq); + + enqueue_task(p, rq, new_prio); + if (task_running(rq, p)) + rq->current_prio_slot = rq->queues + new_prio; + /* + * If the
task increased its shares or is running and + * lowered its shares, then reschedule its CPU: + */ + if ((delta > 0) || ((delta < 0) && task_running(rq, p))) + resched_task(rq->curr); + } + task_rq_unlock(rq, &flags); + return 0; +} + +EXPORT_SYMBOL(set_cpu_shares); + void set_user_nice(task_t *p, long nice) { unsigned long flags; - prio_array_t *array; + int requeue_required; runqueue_t *rq; - int old_prio, new_prio, delta; + int delta; if (TASK_NICE(p) == nice || nice < -20 || nice > 19) return; @@ -1981,22 +2422,25 @@ * it wont have any effect on scheduling until the task is * not SCHED_NORMAL: */ - if (rt_task(p)) { - p->static_prio = NICE_TO_PRIO(nice); - goto out_unlock; - } - array = p->array; - if (array) - dequeue_task(p, array); - - old_prio = p->prio; - new_prio = NICE_TO_PRIO(nice); - delta = new_prio - old_prio; - p->static_prio = NICE_TO_PRIO(nice); - p->prio += delta; - - if (array) { - enqueue_task(p, array); + requeue_required = (!rt_task(p)) && (!list_empty(&p->run_list)); + if (requeue_required) + dequeue_task(p); + + delta = nice - p->nice; + p->nice = nice; + p->cpu_rate_per_share *= p->cpu_shares; + p->cpu_shares = EBS_NICE_TO_SHARES(nice); + p->cpu_rate_per_share /= p->cpu_shares; + p->cpu_incr_per_tick = EBS_INCR_PER_SHARE(p->cpu_shares); + p->cpu_rate_cap_per_share = (p->cpu_rate_cap / p->cpu_shares); + + if (requeue_required) { + /* we won't be in here if it's a real time task */ + int new_prio = sched_normal_effective_prio(p, rq); + + enqueue_task(p, rq, new_prio); + if (task_running(rq, p)) + rq->current_prio_slot = rq->queues + new_prio; /* * If the task increased its priority or is running and * lowered its priority, then reschedule its CPU: @@ -2004,7 +2448,6 @@ if (delta < 0 || (delta > 0 && task_running(rq, p))) resched_task(rq->curr); } -out_unlock: task_rq_unlock(rq, &flags); } @@ -2038,7 +2481,7 @@ if (increment > 40) increment = 40; - nice = PRIO_TO_NICE(current->static_prio) + increment; + nice = current->nice + increment; if 
(nice < -20) nice = -20; if (nice > 19) @@ -2064,7 +2507,24 @@ */ int task_prio(task_t *p) { - return p->prio - MAX_RT_PRIO; + + int prio; + + if (unlikely(rt_task(p))) + prio = rt_effective_prio(p); + else { + unsigned long flags; + /* + * This function is called outside locks so we'll do the honours + */ + runqueue_t *rq = task_rq_lock(p, &flags); + + update_cpu_rate_stats(p); + prio = sched_normal_effective_prio(p, rq); + task_rq_unlock(rq, &flags); + } + + return prio - MAX_RT_PRIO; } /** @@ -2105,8 +2565,7 @@ { struct sched_param lp; int retval = -EINVAL; - int oldprio; - prio_array_t *array; + int is_on_runq; unsigned long flags; runqueue_t *rq; task_t *p; @@ -2166,28 +2625,27 @@ if (retval) goto out_unlock; - array = p->array; - if (array) + is_on_runq = !list_empty(&p->run_list); + if (is_on_runq) deactivate_task(p, task_rq(p)); retval = 0; - p->policy = policy; + if ((p->policy = policy) != SCHED_NORMAL) + p->cpu_ebs_flags &= ~EBS_NEEDS_PRIO_RECALC; p->rt_priority = lp.sched_priority; - oldprio = p->prio; - if (policy != SCHED_NORMAL) - p->prio = MAX_USER_RT_PRIO-1 - p->rt_priority; - else - p->prio = p->static_prio; - if (array) { - __activate_task(p, task_rq(p)); + if (is_on_runq) { + int prio = effective_prio(p); + + __activate_task(p, task_rq(p), prio); /* * Reschedule if we are currently running on this runqueue and * our priority decreased, or if we are not currently running on * this runqueue and our priority is higher than the current's */ - if (task_running(rq, p)) { - if (p->prio > oldprio) + if (rq->curr == p) { + if (prio > rq->current_prio_slot->prio) resched_task(rq->curr); - } else if (p->prio < rq->curr->prio) + rq->current_prio_slot = rq->queues + prio; + } else if (TASK_PREEMPTS_CURR(prio, rq)) resched_task(rq->curr); } @@ -2375,28 +2833,36 @@ /** * sys_sched_yield - yield the current processor to other threads. * - * this function yields the current CPU by moving the calling thread - * to the expired array. 
If there are no other threads running on this - * CPU then this function will return. + * If there are no other threads running on this CPU then this function will + * return. */ asmlinkage long sys_sched_yield(void) { runqueue_t *rq = this_rq_lock(); - prio_array_t *array = current->array; /* - * We implement yielding by moving the task into the expired - * queue. - * * (special rule: RT tasks will just roundrobin in the active * array.) */ if (likely(!rt_task(current))) { - dequeue_task(current, array); - enqueue_task(current, rq->expired); + /* If there's other tasks on this CPU make sure that as many of + * them as possible/judicious get some CPU before this task + */ + dequeue_task(current); + rq->current_prio_slot = rq->queues + sched_normal_effective_prio(current, rq); + if (likely(EBS_BGND_PRI > rq->current_prio_slot->prio)) { + int next_prio = sched_find_first_bit(rq->bitmap); + + if ((MAX_PRIO == next_prio) || (EBS_BGND_PRI > next_prio)) + rq->current_prio_slot = rq->queues + EBS_MAX_PRI; + else + rq->current_prio_slot = rq->queues + EBS_BGND_PRI; + } + enqueue_task(current, rq, rq->current_prio_slot->prio); + current->cpu_ebs_flags &= ~EBS_NEEDS_PRIO_RECALC; } else { - list_del(&current->run_list); - list_add_tail(&current->run_list, array->queue + current->prio); + list_del_init(&current->run_list); + list_add_tail(&current->run_list, &rq->current_prio_slot->queue); } /* * Since we are going to call schedule() anyway, there's @@ -2648,12 +3114,32 @@ local_irq_save(flags); double_rq_lock(idle_rq, rq); + /* + * Make sure that we don't accidentally change the idle task's priority + * during schedule() + */ + idle->cpu_ebs_flags = EBS_FLAGS_DEFAULT; idle_rq->curr = idle_rq->idle = idle; deactivate_task(idle, rq); - idle->array = NULL; - idle->prio = MAX_PRIO; + /* + * Initialising the prom_list enables us to use list_del_init() + * on any task without the overhead of checking whether OK to do so + */ + INIT_LIST_HEAD(&idle->prom_list); + /* + * Should be no need to initialise
other EBS fields as they shouldn't be used + */ idle->state = TASK_RUNNING; set_task_cpu(idle, cpu); + /* + * Putting the idle process onto a run queue simplifies the selection of + * the next task to run in schedule(). + */ + list_add_tail(&idle->run_list, &idle_rq->queues[MAX_PRIO].queue); + /* + * The idle task is the current task on idle_rq + */ + idle_rq->current_prio_slot = idle_rq->queues + MAX_PRIO; double_rq_unlock(idle_rq, rq); set_tsk_need_resched(idle); local_irq_restore(flags); @@ -2732,14 +3218,18 @@ if (task_cpu(p) != smp_processor_id()) goto out; /* Already moved */ - set_task_cpu(p, dest_cpu); - if (p->array) { + if (!list_empty(&p->run_list)) { deactivate_task(p, this_rq()); - activate_task(p, rq_dest); - if (p->prio < rq_dest->curr->prio) + /* + * Do set_task_cpu() AFTER we dequeue the task, since + * dequeue_task() relies on task_cpu() always being accurate. + */ + set_task_cpu(p, dest_cpu); + if (TASK_PREEMPTS_CURR(activate_task(p, rq_dest), rq_dest)) resched_task(rq_dest->curr); + } else { + set_task_cpu(p, dest_cpu); } - p->timestamp = rq_dest->timestamp_last_tick; out: double_rq_unlock(this_rq(), rq_dest); @@ -2906,32 +3396,38 @@ void __init sched_init(void) { runqueue_t *rq; - int i, j, k; + int i, k; /* Init the kstat counters */ init_kstat(); + init_decay_cache(ebs_decay_per_tick); + init_sinbin_table(ebs_half_life_ticks); for (i = 0; i < NR_CPUS; i++) { - prio_array_t *array; - rq = cpu_rq(i); - rq->active = rq->arrays; - rq->expired = rq->arrays + 1; - rq->best_expired_prio = MAX_PRIO; - spin_lock_init(&rq->lock); INIT_LIST_HEAD(&rq->migration_queue); atomic_set(&rq->nr_iowait, 0); nr_running_init(rq); - for (j = 0; j < 2; j++) { - array = rq->arrays + j; - for (k = 0; k < MAX_PRIO; k++) { - INIT_LIST_HEAD(array->queue + k); - __clear_bit(k, array->bitmap); - } - // delimiter for bitsearch - __set_bit(MAX_PRIO, array->bitmap); + for (k = 0; k <= MAX_PRIO; k++) { + rq->queues[k].prio = k; + INIT_LIST_HEAD(&rq->queues[k].queue); + 
__clear_bit(k, rq->bitmap); + } + // delimiter for bitsearch + __set_bit(MAX_PRIO, rq->bitmap); + for (k = 0; k < EBS_NUM_PROM_INTERVALS; k++) { + int j; + + for (j = 0; j < SCHED_OTHER_SLOTS; j++) + INIT_LIST_HEAD(&rq->for_promotion[k][j]); } + rq->next_prom_list = 0; + rq->eff_ent_per_share = 1; /* as small as possible without being zero */ + rq->nr_sinbinned = 0; + rq->sinbinned_ticks = 0; + rq->current_prio_slot = rq->queues + (MAX_PRIO - 20); + rq->next_prom_due = jiffies + ebs_promotion_interval; } /* * We have to do a little magic to get the first @@ -3018,3 +3514,172 @@ EXPORT_SYMBOL(__preempt_write_lock); #endif /* defined(CONFIG_SMP) && defined(CONFIG_PREEMPT) */ + +#ifdef CONFIG_SCHED_STATS +struct task_cpu_stats { + unsigned long long utime; + unsigned long long stime; + unsigned long long runnable; + unsigned long long sinbinned; + unsigned long long slices; +}; + +/* + * Invoked whenever /proc/<pid>/cpu is read. + */ +int proc_pid_cpu(struct task_struct *p, char *buffer) +{ + unsigned long delta_q = jiffies_since(p->runnable_timestamp); + u64 now = jiffies_64_to_clock_t(get_jiffies_64() - INITIAL_JIFFIES); + struct task_cpu_stats total; +#ifdef CONFIG_SMP + struct task_cpu_stats cpu[NR_CPUS]; + int i; +#endif + unsigned long long runnable; + int len; + +#ifdef CONFIG_SMP + /* Take the sample as quickly as possible to maximise validity */ + for (i = 0 ; i < NR_CPUS; i++) { + if (!cpu_online(i)) + continue; + cpu[i].utime = jiffies_64_to_clock_t(p->per_cpu_utime[i]); + cpu[i].stime = jiffies_64_to_clock_t(p->per_cpu_stime[i]); + cpu[i].runnable = jiffies_64_to_clock_t(p->per_cpu_runnable[i]); + cpu[i].sinbinned = jiffies_to_clock_t(p->per_cpu_sinbin_ticks[i]); + cpu[i].slices = p->per_cpu_slices[i]; + } + + /* be noncommittal about status of current tick */ + if ((delta_q > 1) && !list_empty(&p->run_list)) + cpu[task_cpu(p)].runnable += jiffies_64_to_clock_t(delta_q - 1); + /* Sum data, update run queue time, and determine sleep time */ + memset(&total, 0,
sizeof(total)); + for (i = 0 ; i < NR_CPUS; i++) { + if (!cpu_online(i)) + continue; + total.utime += cpu[i].utime; + total.stime += cpu[i].stime; + if ((runnable = cpu[i].utime + cpu[i].stime) > cpu[i].runnable) + cpu[i].runnable = runnable; + total.runnable += cpu[i].runnable; + total.sinbinned += cpu[i].sinbinned; + total.slices += cpu[i].slices; + } +#else + total.utime = jiffies_64_to_clock_t(p->utime); + total.stime = jiffies_64_to_clock_t(p->stime); + total.runnable = jiffies_64_to_clock_t(p->runnable); + total.sinbinned = jiffies_to_clock_t(p->sinbin_ticks); + total.slices = p->slices; + + /* be noncommittal about status of current tick */ + if ((delta_q > 1) && !list_empty(&p->run_list)) + total.runnable += jiffies_64_to_clock_t(delta_q - 1); + + if ((runnable = total.utime + total.stime) > total.runnable) + total.runnable = runnable; +#endif + /* Print total to buffer and per cpu statistics */ + len = sprintf(buffer, "cpu %llu %llu %llu %llu %llu @ %llu\n", + total.utime, total.stime, total.runnable, + total.sinbinned, total.slices, now); + +#ifdef CONFIG_SMP + if (num_online_cpus() > 1) { + for (i = 0 ; i < NR_CPUS; i++) { + if (!cpu_online(i)) + continue; + len += sprintf(buffer + len, + "cpu%d %llu %llu %llu %llu %llu\n", + i, cpu[i].utime, cpu[i].stime, + cpu[i].runnable, cpu[i].sinbinned, + cpu[i].slices); + } + } +#endif + + return len; +} + + +/* + * With multiple CPUs some of these totals can easily get too big to + * fit in a long on 32 bit machines so use u64. + */ +struct system_cpu_stats { + u64 user; + u64 system; + u64 runnable; + u64 idle; + u64 iowait; + u64 nr_switches; +}; + +/* + * Invoked whenever /proc/cpustats is read.
+ */ +int cpustats_read_proc(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct system_cpu_stats total, cpu[NR_CPUS]; + u64 now = jiffies_64_to_clock_t(get_jiffies_64() - INITIAL_JIFFIES); + runqueue_t *rq; + int i, len; + + /* Take a snapshot as briefly as possible to maximise validity */ + for (i = 0; i < NR_CPUS; i++) { + if (!cpu_online(i)) + continue; + rq = cpu_rq(i); + cpu[i].user = jiffies_64_to_clock_t(kstat_cpu(i).cpustat.user); + cpu[i].user += jiffies_64_to_clock_t(kstat_cpu(i).cpustat.nice); + cpu[i].system = jiffies_64_to_clock_t(kstat_cpu(i).cpustat.system); + cpu[i].runnable = jiffies_64_to_clock_t(kstat_cpu(i).cpustat.runnable); + cpu[i].idle = jiffies_64_to_clock_t(kstat_cpu(i).cpustat.idle); + cpu[i].iowait = jiffies_64_to_clock_t(kstat_cpu(i).cpustat.iowait); + cpu[i].nr_switches = rq->nr_switches; + } + + /* Work out the totals */ + memset(&total, 0, sizeof(total)); + for (i = 0 ; i < NR_CPUS; i++) { + if (!cpu_online(i)) + continue; + total.user += cpu[i].user; + total.system += cpu[i].system; + total.runnable += cpu[i].runnable; + total.idle += cpu[i].idle; + total.iowait += cpu[i].iowait; + total.nr_switches += cpu[i].nr_switches; + } + + /* Print totals to buffer followed by per cpu statistics */ + len = sprintf(page, "cpu %llu %llu %llu %llu %llu %llu @ %llu\n", + total.user, total.system, total.runnable, total.idle, + total.iowait, total.nr_switches, now); + + if (num_online_cpus() > 1) { + for (i = 0 ; i < NR_CPUS; i++) { + if (!cpu_online(i)) + continue; + len += sprintf(page + len, + "cpu%d %llu %llu %llu %llu %llu %llu\n", + i, cpu[i].user, cpu[i].system, + cpu[i].runnable, cpu[i].idle, + cpu[i].iowait, cpu[i].nr_switches); + } + } + + if (len <= off + count) + *eof = 1; + *start = page + off; + len -= off; + if (len > count) + len = count; + if (len < 0) + len = 0; + return len; +} +#endif Index: Linux-2.6.2/kernel/timer.c diff -u Linux-2.6.2/kernel/timer.c:1.1.1.1 
Linux-2.6.2/kernel/timer.c:1.1.1.1.12.1 --- Linux-2.6.2/kernel/timer.c:1.1.1.1 Fri Feb 6 12:03:09 2004 +++ Linux-2.6.2/kernel/timer.c Sat Feb 7 23:06:40 2004 @@ -729,6 +729,10 @@ void update_one_process(struct task_struct *p, unsigned long user, unsigned long system, int cpu) { +#if defined(CONFIG_SCHED_STATS) && defined(CONFIG_SMP) + p->per_cpu_utime[cpu] += user; + p->per_cpu_stime[cpu] += system; +#endif do_process_times(p, user, system); do_it_virt(p, user); do_it_prof(p);