# # old_revision [c9c382f0a12ab75bee370938f9b7ad5b582cf39a] # # patch "arch/x86_64/kernel/smp.c" # from [b354a6e4a080ad49f4213ab4334ca2e57ddf1bdc] # to [a76a90aaad275ab3775d32ac3ae500fe120c5b8c] # # patch "arch/x86_64/mm/numa.c" # from [72976a7a4aff795f020f34e91e98e0defa280d3b] # to [9f2858d19a369e3aceaf084796837bd02e9bc6b7] # # patch "fs/jbd/journal.c" # from [790c84b34b33cbaac872a27bbb35a29a807829f1] # to [05c6974b6cad8f3608ae4e588bda364e57a0722c] # # patch "include/linux/hardirq.h" # from [eec8eb859010dd92053d316fca021c538837dc37] # to [d8f5b5db09aec3b63922f5467205afdad256c8e6] # # patch "include/linux/init_task.h" # from [938a1587ab9e35bb8d24cf843d4e7424e3030a4c] # to [b7acecacd94e4ee753c833bc8dd84623975927f9] # # patch "include/linux/preempt.h" # from [4889dfe3225043bc783885e6d78e8907c30f74e0] # to [742d8ba8fff17e6e7038208d57c8b474f578653e] # # patch "include/linux/sched.h" # from [0ed8993484be9c13728f4ebdaa51fc0f0c229018] # to [c65ebaa452498f611280baafd8ee6282ea0746f2] # # patch "include/linux/seqlock.h" # from [e0e6ee774c4c5dd9f3e0fd4d3ada566cda45d11b] # to [d0b08ef3758e5c0fa679d44a67582ca0b03878de] # # patch "kernel/Makefile" # from [24ff9cac9c543a2d7f3581ddb005b29f43854a79] # to [bf4003ae851e4a1ff34e3b4ae95a54a9e77c2392] # # patch "kernel/exit.c" # from [f4cc2f8e48a262bd26b6fc5d1253a8482c8c2d04] # to [38720025341ef0a1c923a06ecf330fffb718c62c] # # patch "kernel/fork.c" # from [506dabd42d242f78e0321594c7723481e0cd87dc] # to [8fd6fe77d8bf899f065f2e5d76d551b105afbfad] # # patch "kernel/panic.c" # from [94280f59f220a9b9ff297af039de0a0da844a23a] # to [f8c9b1f8eb91e63e8e38ca3df568ccf0b60f1f29] # # patch "kernel/printk.c" # from [18a2601f7196e660485677e2e27d7be7a5d86a8c] # to [311014984cb83004427d564b6cbe67b7747a49a7] # # patch "kernel/rtmutex.c" # from [d9e1288a369758ad7144999ef59019a51e17ce7a] # to [c95b398e5ee1777d0a224c112d3b42c19bf68516] # # patch "kernel/sched.c" # from [c305b32993a0e901b43550cfcc9ab48f1fa44912] # to 
[75f5479ee93f36a8625ec7bbc7421c41c7f842e9] # ============================================================ --- arch/x86_64/kernel/smp.c b354a6e4a080ad49f4213ab4334ca2e57ddf1bdc +++ arch/x86_64/kernel/smp.c a76a90aaad275ab3775d32ac3ae500fe120c5b8c @@ -57,7 +57,7 @@ struct mm_struct *flush_mm; unsigned long flush_va; #define FLUSH_ALL -1ULL - spinlock_t tlbstate_lock; + raw_spinlock_t tlbstate_lock; }; char pad[SMP_CACHE_BYTES]; } ____cacheline_aligned; ============================================================ --- arch/x86_64/mm/numa.c 72976a7a4aff795f020f34e91e98e0defa280d3b +++ arch/x86_64/mm/numa.c 9f2858d19a369e3aceaf084796837bd02e9bc6b7 @@ -317,6 +317,7 @@ { int i; unsigned long pages = 0; + for_each_online_node(i) { pages += free_all_bootmem_node(NODE_DATA(i)); } ============================================================ --- fs/jbd/journal.c 790c84b34b33cbaac872a27bbb35a29a807829f1 +++ fs/jbd/journal.c 05c6974b6cad8f3608ae4e588bda364e57a0722c @@ -146,7 +146,7 @@ jbd_debug(1, "OK, requests differ\n"); spin_unlock(&journal->j_state_lock); del_timer_sync(&journal->j_commit_timer); - journal_commit_transaction(journal); + journal_commit_transaction(journal); // funky schedule() stuff here --billh spin_lock(&journal->j_state_lock); goto loop; } @@ -160,7 +160,7 @@ */ jbd_debug(1, "Now suspending kjournald\n"); spin_unlock(&journal->j_state_lock); - refrigerator(); + refrigerator(); // funky schedule() stuff here as well --billh spin_lock(&journal->j_state_lock); } else { /* @@ -182,7 +182,7 @@ should_sleep = 0; if (should_sleep) { spin_unlock(&journal->j_state_lock); - schedule(); + schedule(); // more of the same --billh spin_lock(&journal->j_state_lock); } finish_wait(&journal->j_wait_commit, &wait); ============================================================ --- include/linux/hardirq.h eec8eb859010dd92053d316fca021c538837dc37 +++ include/linux/hardirq.h d8f5b5db09aec3b63922f5467205afdad256c8e6 @@ -58,6 +58,8 @@ # error PREEMPT_ACTIVE is too low! 
#endif +#define ATOMIC_MASK (~PREEMPT_ACTIVE) + #define hardirq_count() (preempt_count() & HARDIRQ_MASK) #define softirq_count() (preempt_count() & SOFTIRQ_MASK) #define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK)) @@ -70,12 +72,17 @@ #define in_softirq() (softirq_count() || (current->flags & PF_SOFTIRQ)) #define in_interrupt() (irq_count()) +/* + * in_atomic() masks PREEMPT_ACTIVE out of preempt_count() and compares the + * remainder with kernel_locked() (CONFIG_PREEMPT without PREEMPT_BKL and + * PREEMPT_RT) or with zero (every other configuration). --billh + */ #if defined(CONFIG_PREEMPT) && \ !defined(CONFIG_PREEMPT_BKL) && \ !defined(CONFIG_PREEMPT_RT) -# define in_atomic() ((preempt_count() & ~PREEMPT_ACTIVE) != kernel_locked()) +# define in_atomic() ((preempt_count() & ATOMIC_MASK) != kernel_locked()) #else -# define in_atomic() ((preempt_count() & ~PREEMPT_ACTIVE) != 0) +# define in_atomic() ((preempt_count() & ATOMIC_MASK) != 0) #endif #ifdef CONFIG_PREEMPT ============================================================ --- include/linux/init_task.h 938a1587ab9e35bb8d24cf843d4e7424e3030a4c +++ include/linux/init_task.h b7acecacd94e4ee753c833bc8dd84623975927f9 @@ -126,7 +126,8 @@ .cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \ .fs_excl = ATOMIC_INIT(0), \ .posix_timer_list = NULL, \ - INIT_RT_MUTEXES(tsk) \ + INIT_RT_MUTEXES(tsk), \ + .delayed_drop = LIST_HEAD_INIT(tsk.delayed_drop) \ } ============================================================ --- include/linux/sched.h 0ed8993484be9c13728f4ebdaa51fc0f0c229018 +++ include/linux/sched.h c65ebaa452498f611280baafd8ee6282ea0746f2 @@ -1082,6 +1091,9 @@ * cache last used pipe for splice */ struct pipe_inode_info *splice_pipe; + + /* links the task on the per-CPU delayed_put_task_struct_list in kernel/fork.c --billh */ + struct list_head delayed_drop; // should investigate how do_fork() handles this as well }; static inline pid_t process_group(struct task_struct *tsk) ============================================================ --- include/linux/seqlock.h e0e6ee774c4c5dd9f3e0fd4d3ada566cda45d11b +++ include/linux/seqlock.h d0b08ef3758e5c0fa679d44a67582ca0b03878de @@ -1,8 +1,8 @@ #ifndef
__LINUX_SEQLOCK_H #define __LINUX_SEQLOCK_H /* * Reader/writer consistent mechanism without starving writers. This type of - * lock for data where the reader wants a consitent set of information + * lock for data where the reader wants a consistent set of information * and is willing to retry if the information changes. Readers never * block but they may have to retry if a writer is in * progress. Writers do not wait for readers. ============================================================ --- kernel/exit.c f4cc2f8e48a262bd26b6fc5d1253a8482c8c2d04 +++ kernel/exit.c 38720025341ef0a1c923a06ecf330fffb718c62c @@ -131,11 +131,6 @@ } } -static void delayed_put_task_struct(struct rcu_head *rhp) -{ - put_task_struct(container_of(rhp, struct task_struct, rcu)); -} - void release_task(struct task_struct * p) { int zap_leader; @@ -177,7 +172,7 @@ spin_unlock(&p->proc_lock); proc_pid_flush(proc_dentry); release_thread(p); - call_rcu(&p->rcu, delayed_put_task_struct); + put_task_struct(p); p = leader; if (unlikely(zap_leader)) ============================================================ --- kernel/fork.c 506dabd42d242f78e0321594c7723481e0cd87dc +++ kernel/fork.c 8fd6fe77d8bf899f065f2e5d76d551b105afbfad @@ -75,7 +75,10 @@ */ static DEFINE_PER_CPU(struct task_struct *, desched_task); -static DEFINE_PER_CPU(struct list_head, delayed_drop_list); +static DEFINE_PER_CPU(struct list_head, delayed_mmdrop_list); +#ifdef CONFIG_PREEMPT_RT +static DEFINE_PER_CPU(struct list_head, delayed_put_task_struct_list); //--billh +#endif int nr_processes(void) { @@ -120,7 +123,17 @@ } EXPORT_SYMBOL(free_task); +void fastcall free_task_delayed(struct task_struct *task); + +/* + * On PREEMPT_RT this body runs later from the desched thread, after __put_task_struct() queued the task; otherwise it deallocates inline + * --billh + */ +#ifdef CONFIG_PREEMPT_RT +void __put_task_struct_inline(struct task_struct *tsk) +#else void __put_task_struct(struct task_struct *tsk) +#endif { WARN_ON(!(tsk->exit_state & (EXIT_DEAD | EXIT_ZOMBIE)));
BUG_ON(atomic_read(&tsk->usage)); @@ -134,7 +147,7 @@ if (!profile_handoff_task(tsk)) free_task(tsk); } - + void __init fork_init(unsigned long mempages) { int i; @@ -167,8 +180,12 @@ init_task.signal->rlim[RLIMIT_SIGPENDING] = init_task.signal->rlim[RLIMIT_NPROC]; - for (i = 0; i < NR_CPUS; i++) - INIT_LIST_HEAD(&per_cpu(delayed_drop_list, i)); + for (i = 0; i < NR_CPUS; i++) { + INIT_LIST_HEAD(&per_cpu(delayed_mmdrop_list, i)); +#ifdef CONFIG_PREEMPT_RT + INIT_LIST_HEAD(&per_cpu(delayed_put_task_struct_list, i)); //--billh +#endif + } } static struct task_struct *dup_task_struct(struct task_struct *orig) @@ -1067,6 +1084,9 @@ #endif rt_mutex_init_task(p); +#ifdef CONFIG_PREEMPT_RT + INIT_LIST_HEAD(&p->delayed_drop); //--billh +#endif #ifdef CONFIG_DEBUG_MUTEXES p->blocked_on = NULL; /* not blocked yet */ @@ -1693,24 +1713,73 @@ return err; } +static void _wake_cpu_desched_task(void) +{ + struct task_struct *desched_task; + + desched_task = __get_cpu_var(desched_task); + if (desched_task) + wake_up_process(desched_task); +} + +#ifdef CONFIG_PREEMPT_RT +static int put_task_struct_complete(void) +{ + struct list_head *head; + int ret = 0; + + head = &get_cpu_var(delayed_put_task_struct_list); + while (!list_empty(head)) { + struct task_struct *task = list_entry(head->next, + struct task_struct, delayed_drop); + list_del(&task->delayed_drop); + put_cpu_var(delayed_put_task_struct_list); + + __put_task_struct_inline(task); // call the original function to perform the operation + ret = 1; + + head = &get_cpu_var(delayed_put_task_struct_list); + } + put_cpu_var(delayed_put_task_struct_list); + + return ret; +} + +/* + * We dont want to do complex work from the scheduler with preemption + * disabled, therefore we delay the work to a per-CPU worker thread. 
+ */ +void fastcall __put_task_struct(struct task_struct *task) +{ + struct list_head *head; + + head = &get_cpu_var(delayed_put_task_struct_list); + list_add_tail(&task->delayed_drop, head); + _wake_cpu_desched_task(); + put_cpu_var(delayed_put_task_struct_list); +} +#else /* !CONFIG_PREEMPT_RT: nothing is ever queued, so desched_thread() has no puts to complete */ +static inline int put_task_struct_complete(void) { return 0; } +#endif /* CONFIG_PREEMPT_RT */ + static int mmdrop_complete(void) { struct list_head *head; int ret = 0; - head = &get_cpu_var(delayed_drop_list); + head = &get_cpu_var(delayed_mmdrop_list); while (!list_empty(head)) { struct mm_struct *mm = list_entry(head->next, struct mm_struct, delayed_drop); list_del(&mm->delayed_drop); - put_cpu_var(delayed_drop_list); + put_cpu_var(delayed_mmdrop_list); __mmdrop(mm); ret = 1; - head = &get_cpu_var(delayed_drop_list); + head = &get_cpu_var(delayed_mmdrop_list); } - put_cpu_var(delayed_drop_list); + put_cpu_var(delayed_mmdrop_list); return ret; } @@ -1721,15 +1790,14 @@ */ void fastcall __mmdrop_delayed(struct mm_struct *mm) { - struct task_struct *desched_task; struct list_head *head; - head = &get_cpu_var(delayed_drop_list); + head = &get_cpu_var(delayed_mmdrop_list); list_add_tail(&mm->delayed_drop, head); - desched_task = __get_cpu_var(desched_task); - if (desched_task) - wake_up_process(desched_task); - put_cpu_var(delayed_drop_list); + + _wake_cpu_desched_task(); + + put_cpu_var(delayed_mmdrop_list); } static int desched_thread(void * __bind_cpu) @@ -1743,6 +1811,9 @@ if (mmdrop_complete()) continue; + if (put_task_struct_complete()) + continue; + schedule(); /* This must be called from time to time on ia64, and is a no-op on other archs.
@@ -1767,7 +1838,10 @@ case CPU_UP_PREPARE: BUG_ON(per_cpu(desched_task, hotcpu)); - INIT_LIST_HEAD(&per_cpu(delayed_drop_list, hotcpu)); + INIT_LIST_HEAD(&per_cpu(delayed_mmdrop_list, hotcpu)); +#ifdef CONFIG_PREEMPT_RT + INIT_LIST_HEAD(&per_cpu(delayed_put_task_struct_list, hotcpu)); // --billh +#endif p = kthread_create(desched_thread, hcpu, "desched/%d", hotcpu); if (IS_ERR(p)) { printk("desched_thread for %i failed\n", hotcpu); ============================================================ --- kernel/panic.c 94280f59f220a9b9ff297af039de0a0da844a23a +++ kernel/panic.c f8c9b1f8eb91e63e8e38ca3df568ccf0b60f1f29 @@ -65,7 +65,9 @@ #if defined(CONFIG_S390) unsigned long caller = (unsigned long) __builtin_return_address(0); #endif - +#ifdef CONFIG_LATENCY_TRACE + stop_trace(); +#endif /* * It's possible to come here directly from a panic-assertion and not * have preempt disabled. Some functions called from here want ============================================================ --- kernel/printk.c 18a2601f7196e660485677e2e27d7be7a5d86a8c +++ kernel/printk.c 311014984cb83004427d564b6cbe67b7747a49a7 @@ -1079,13 +1079,20 @@ void __WARN_ON(const char *func, const char *file, const int line) { unsigned long flags; + int oops_in_progress_state = oops_in_progress; +#ifdef CONFIG_PREEMPT_RT // bad hack, works for now --billh + oops_in_progress = 1; +#endif spin_lock_irqsave(&warn_lock, flags); printk("%s/%d[CPU#%d]: BUG in %s at %s:%d\n", current->comm, current->pid, raw_smp_processor_id(), func, file, line); dump_stack(); spin_unlock_irqrestore(&warn_lock, flags); +#ifdef CONFIG_PREEMPT_RT + oops_in_progress = oops_in_progress_state; +#endif } EXPORT_SYMBOL(__WARN_ON); ============================================================ --- kernel/rtmutex.c d9e1288a369758ad7144999ef59019a51e17ce7a +++ kernel/rtmutex.c c95b398e5ee1777d0a224c112d3b42c19bf68516 @@ -13,6 +13,9 @@ #include #include +#include //-billh +#include //-billh + #include "rtmutex_common.h" #ifdef 
CONFIG_DEBUG_RT_MUTEXES @@ -58,12 +61,21 @@ val |= RT_MUTEX_HAS_WAITERS; lock->owner = (struct task_struct *)val; + barrier(); } +static void +rt_mutex_clear_owner(struct rt_mutex *lock) +{ + lock->owner = (struct task_struct *) NULL; + barrier(); +} + static inline void clear_rt_mutex_waiters(struct rt_mutex *lock) { lock->owner = (struct task_struct *) ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS); + barrier(); } static void fixup_rt_mutex_waiters(struct rt_mutex *lock) @@ -84,6 +96,7 @@ do { owner = *p; + cpu_relax(); } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner); } #else @@ -92,6 +105,7 @@ { lock->owner = (struct task_struct *) ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS); + barrier(); } #endif @@ -140,7 +154,7 @@ int prio = rt_mutex_getprio(task); if (task->prio != prio) - rt_mutex_setprio(task, prio); + rt_mutex_setprio(task, prio); /* reschedules task if the priority is lower, holds the run queue lock */ } /* @@ -725,7 +739,7 @@ rt_mutex_deadlock_account_unlock(current); if (!rt_mutex_has_waiters(lock)) { - lock->owner = NULL; + rt_mutex_clear_owner(lock); spin_unlock(&lock->wait_lock); return; } @@ -738,8 +752,29 @@ rt_mutex_adjust_prio(current); } +/* Panic when in_atomic() on a running system with no oops in progress and a non-exiting task; do/while(0) keeps it statement-safe. */ +#define PANIC_IF_IN_ATOMIC() \ + do { \ + if ((system_state == SYSTEM_RUNNING) && \ + in_atomic() && !oops_in_progress && \ + !current->exit_state) { \ + panic("%s: in atomic: " "%s/0x%08x/%d\n", \ + __func__, current->comm, preempt_count(), current->pid); \ + } \ + } while (0) + +//static int test_var = 1000; void __lockfunc rt_lock(struct rt_mutex *lock) { + PANIC_IF_IN_ATOMIC(); +/* + if (system_state == SYSTEM_RUNNING && --test_var == 0) { + printk("%s: blah\n", __func__); + WARN_ON(1); + } + */ + + might_sleep(); rt_lock_fastlock(lock, rt_lock_slowlock); } EXPORT_SYMBOL(rt_lock); @@ -952,7 +987,7 @@ rt_mutex_deadlock_account_unlock(current); if (!rt_mutex_has_waiters(lock)) { - lock->owner = NULL; + rt_mutex_clear_owner(lock); spin_unlock(&lock->wait_lock); return; } @@ -1027,8
+1062,9 @@ */ void __sched rt_mutex_lock(struct rt_mutex *lock) { - might_sleep(); + PANIC_IF_IN_ATOMIC(); + might_sleep(); rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, 0, rt_mutex_slowlock); } EXPORT_SYMBOL_GPL(rt_mutex_lock); @@ -1048,7 +1084,6 @@ int detect_deadlock) { might_sleep(); - return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, detect_deadlock, rt_mutex_slowlock); } @@ -1133,7 +1168,7 @@ */ void __rt_mutex_init(struct rt_mutex *lock, const char *name) { - lock->owner = NULL; + rt_mutex_clear_owner(lock); spin_lock_init(&lock->wait_lock); plist_head_init(&lock->wait_list, &lock->wait_lock); ============================================================ --- kernel/sched.c c305b32993a0e901b43550cfcc9ab48f1fa44912 +++ kernel/sched.c 75f5479ee93f36a8625ec7bbc7421c41c7f842e9 @@ -3664,9 +3664,8 @@ __setup("preempt=", preempt_setup); /* - * this is is the entry point to schedule() from in-kernel preemption - * off of preempt_enable. Kernel preemptions off return from interrupt - * occur there and call schedule directly. + * This is the entry point to schedule() from an "in-kernel preemption" + * from a preempt_enable check. */ asmlinkage void __sched preempt_schedule(void) { @@ -3686,7 +3685,7 @@ need_resched: local_irq_disable(); - add_preempt_count(PREEMPT_ACTIVE); + add_preempt_count(PREEMPT_ACTIVE); // Calls to the latency/preemption tracer. /* * We keep the big kernel semaphore locked, but we * clear ->lock_depth so that schedule() doesnt @@ -3711,10 +3710,10 @@ EXPORT_SYMBOL(preempt_schedule); /* - * this is is the entry point for the IRQ return path. Called with - * interrupts disabled. To avoid infinite irq-entry recursion problems - * with fast-paced IRQ sources we do all of this carefully to never - * enable interrupts again. + * This is the entry point for the IRQ return path. Called with interrupts + * disabled. 
To avoid infinite irq-entry recursion problems with fast-paced + * IRQ sources, we do all of this carefully to never enable interrupts again + * since it's going back to the IRQ return code. */ asmlinkage void __sched preempt_schedule_irq(void) {