While configuring a Linux kernel, we can set some parameters that affect the system behavior. You can work with different priorities, scheduling classes and preemption models. It is very important to understand and choose the right parameters.

In this post I will cover the different preemption models and how each one affects user and kernel behaviour.

If you configure the kernel (using make menuconfig), you can find, usually in the "Kernel features" sub-menu, the option "Preemption model":

To understand each option, let's take an example:

We have 2 threads – one with high real time priority(50) and the other with low RT priority(30)

The high priority thread went to sleep for 3 seconds

The low priority thread uses the CPU for user space calculations

After 3 seconds the high priority thread will wake up

This case is easy and reasonable, but what happens if the low priority thread calls kernel code while the high priority thread is sleeping? It depends on the above configuration.

No Forced Preemption

The context switch is done only when we return from the kernel. Let's take an example:

We have 2 threads – one with high real time priority(50) and the other with low RT priority(30)

The high priority thread went to sleep for 3 seconds

The low priority thread calls kernel code that lasts for 5 seconds

After 5 seconds the low priority thread returns from the kernel

The high priority thread will wake up (2 seconds late)

Let's see the code:

Kernel code – simple character device driver:

/*
 * demo - minimal character device used to show the effect of the kernel
 * preemption model.
 *
 * read() on the device busy-waits inside the kernel for 5 seconds and then
 * returns the string "hello". The device number is fixed at major 400,
 * minor 0 (create the node with: mknod /dev/demo c 400 0).
 */
#include <linux/module.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/uaccess.h>	/* copy_to_user(); replaces the direct <asm/uaccess.h> include */
#include <linux/fs.h>
#include <linux/gfp.h>
#include <linux/cdev.h>
#include <linux/sched.h>
#include <linux/kdev_t.h>
#include <linux/delay.h>
#include <linux/ioctl.h>
#include <linux/slab.h>
#include <linux/mempool.h>
#include <linux/mm.h>
#include <asm/io.h>

static dev_t my_dev;
static struct cdev *my_cdev;

/*
 * Callback for the read system call on the device.
 *
 * Returns 0 (end of file) once the file position is past the start,
 * otherwise busy-waits for 5 seconds and copies up to 5 bytes of "hello"
 * to the user buffer. Returns the number of bytes copied, or -EFAULT if the
 * user buffer cannot be written.
 */
static ssize_t my_read(struct file *file, char __user *buf, size_t count,
		       loff_t *ppos)
{
	static const char msg[] = "hello";
	size_t len = sizeof(msg) - 1;

	if (*ppos > 0)
		return 0;

	/* Never write more than the caller asked for. */
	if (len > count)
		len = count;

	mdelay(5000);	/* busy-wait for 5 seconds */

	if (copy_to_user(buf, msg, len))
		return -EFAULT;

	*ppos += len;
	return len;
}

static const struct file_operations my_fops = {
	.owner = THIS_MODULE,
	.read = my_read,
};

/*
 * Module entry point: reserve device number 400:0, allocate the cdev and
 * register it. Every failure path releases what was acquired before it and
 * returns a negative errno.
 */
static int __init hello_init(void)
{
	int ret;

	my_dev = MKDEV(400, 0);
	ret = register_chrdev_region(my_dev, 1, "demo");
	if (ret) {
		printk(KERN_INFO "chrdev region error.\n");
		return ret;
	}

	my_cdev = cdev_alloc();
	if (!my_cdev) {
		printk(KERN_INFO "cdev alloc error.\n");
		ret = -ENOMEM;
		goto err_region;
	}

	my_cdev->ops = &my_fops;
	my_cdev->owner = THIS_MODULE;

	ret = cdev_add(my_cdev, my_dev, 1);
	if (ret) {
		printk(KERN_INFO "cdev add error.\n");
		goto err_cdev;
	}

	return 0;

err_cdev:
	/* The cdev was never added, so drop the reference taken by cdev_alloc(). */
	kobject_put(&my_cdev->kobj);
err_region:
	unregister_chrdev_region(my_dev, 1);
	return ret;
}

/* Module exit point: remove the cdev and give back the device number. */
static void __exit hello_cleanup(void)
{
	cdev_del(my_cdev);
	unregister_chrdev_region(my_dev, 1);
}

module_init(hello_init);
module_exit(hello_cleanup);
MODULE_LICENSE("GPL");

The read delays for 5 seconds (mdelay is a busy-wait loop) and then returns some data

The User space code:

/*
 * User space test: a high priority (50) and a low priority (30) SCHED_RR
 * thread. The high priority thread sleeps for 3 seconds and prints its
 * start/stop times; the low priority thread sleeps for 1 second and then
 * issues a read on /dev/demo.
 *
 * Creating SCHED_RR threads normally requires root privileges.
 */
#define _POSIX_C_SOURCE 200809L

#include <stdio.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <pthread.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>

/* High priority thread: print the time, sleep 3 seconds, print the time again. */
void *hi_prio(void *p)
{
    (void)p;
    printf("thread1 start time=%ld\n", (long)time(NULL));
    sleep(3);
    printf("thread1 stop time=%ld\n", (long)time(NULL));
    return NULL;
}

/* Low priority thread: after 1 second, read from the demo device. */
void *low_prio(void *p)
{
    char buf[20];
    int fd;

    (void)p;
    sleep(1);

    fd = open("/dev/demo", O_RDWR); /* # mknod /dev/demo c 400 0 */
    if (fd < 0) {
        perror("open /dev/demo");
        return NULL;
    }

    puts("thread2 start");
    if (read(fd, buf, sizeof buf) < 0)
        perror("read /dev/demo");
    puts("thread2 stop");

    close(fd);
    return NULL;
}

/* Set the priority in attr and start fn as a new thread; returns 0 or an errno value. */
static int start_rt_thread(pthread_t *tid, pthread_attr_t *attr, int prio,
                           void *(*fn)(void *))
{
    struct sched_param param = { .sched_priority = prio };
    int err = pthread_attr_setschedparam(attr, &param);

    if (err == 0)
        err = pthread_create(tid, attr, fn, NULL);
    if (err != 0)
        fprintf(stderr, "cannot start thread with priority %d: %s\n",
                prio, strerror(err));
    return err;
}

int main(void)
{
    pthread_t t1, t2;
    pthread_attr_t attr;
    int rc = 0;

    pthread_attr_init(&attr);
    /*
     * Without PTHREAD_EXPLICIT_SCHED the new threads inherit the creator's
     * scheduling settings and the policy/priority set below are ignored.
     */
    pthread_attr_setinheritsched(&attr, PTHREAD_EXPLICIT_SCHED);
    pthread_attr_setschedpolicy(&attr, SCHED_RR);

    if (start_rt_thread(&t1, &attr, 50, hi_prio) != 0) {
        pthread_attr_destroy(&attr);
        return 1;
    }

    if (start_rt_thread(&t2, &attr, 30, low_prio) == 0)
        pthread_join(t2, NULL);
    else
        rc = 1;

    pthread_join(t1, NULL);
    pthread_attr_destroy(&attr);

    puts("end test");
    return rc;
}

The high priority goes to sleep for 3 seconds.

The low priority thread is sleeping for one second and then calls the kernel

The high priority thread wakes up after 6 seconds:

# insmod demo.ko # ./app thread1 start time=182 thread2 start thread1 stop time=188 thread2 stop end test 1 2 3 4 5 6 7 # insmod demo.ko # ./app thread1 start time = 182 thread2 start thread1 stop time = 188 thread2 stop end test

Preemptible Kernel

In this configuration the context switch is done on time even inside the kernel, which means that if we run the above test we will see the high priority thread waking up after 3 seconds:

It means that with this option the system will perform more context switches per second but it is more “real time”. On embedded systems with soft real time requirements it is a best practice to use this option, but on a server system, where we usually work asynchronously, the first option is better – fewer context switches – more CPU time

The output:

# insmod ./demo.ko #./app thread1 start time=234 thread2 start thread1 stop time=237 thread2 stop end test 1 2 3 4 5 6 7 # insmod ./demo.ko #./app thread1 start time = 234 thread2 start thread1 stop time = 237 thread2 stop end test

Voluntary Kernel Preemption

In this configuration the system works like “no forced preemption”, but a kernel developer who writes long-running code is responsible for checking from time to time whether rescheduling is needed. This can be done with the might_resched() function

So in this example, if we want to add this “check point” we will change the code:

// callback for read system call on the device static ssize_t my_read(struct file *file, char __user *buf,size_t count,loff_t *ppos) { int len=5; if(*ppos > 0) { return 0; } mdelay(4000); // busy-wait for 4 seconds might_resched(); delay(3000); // busy wait for 3 seconds if (copy_to_user(buf , "hello" , len)) { return -EFAULT; } else { *ppos +=len; return len; } } 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 // callback for read system call on the device static ssize_t my_read ( struct file * file , char __user * buf , size_t count , loff_t * ppos ) { int len = 5 ; if ( * ppos > 0 ) { return 0 ; } mdelay ( 4000 ) ; // busy-wait for 4 seconds might_resched ( ) ; delay ( 3000 ) ; // busy wait for 3 seconds if ( copy_to_user ( buf , "hello" , len ) ) { return - EFAULT ; } else { * ppos += len ; return len ; } }

If we comment out the might_resched() line, the high priority thread will be delayed for 7 seconds in total. Adding the might_resched() call (which performs a cond_resched() check when voluntary preemption is configured) checks for, and performs, a context switch if another high priority thread is awake. The high priority thread will then run after 5 seconds (1 second before the call and 4 seconds in the kernel)

Output:

# insmod ./demo.ko #./app thread1 start time=320 thread2 start thread1 stop time=325 thread2 stop end test 1 2 3 4 5 6 7 # insmod ./demo.ko #./app thread1 start time = 320 thread2 start thread1 stop time = 325 thread2 stop end test

Full Real Time Preemption

If you apply the RT patch, you get a hard real-time kernel. This means any code can be preempted by other code: if an interrupt service routine is running and something more urgent needs to be handled, it will preempt the ISR code. The patch changes the following:

Converting hardware Interrupts to threads with RT priority 50

Converting SoftIRQs to threads with RT 49 priority

Converting all spinlocks to mutexes

Configuring and using Hi resolution timers

some more minor features

After applying the patch you can see 2 more options in the menu:

The option “Preemptible Kernel (Basic RT)” is for debugging (see documentation)

To make all the above changes you need to select the last option – Fully Preemptible Kernel.

Now if you create a thread with an RT priority higher than 50, it will preempt the interrupt handler threads

Note that in this configuration the system has more tasks and performs more context switches per second, i.e. the CPU spends more time switching tasks, but we can hit any required deadline (1ms or more)