One of the building blocks for implementing containers is Linux namespaces. Namespaces control what a process can see: process IDs, mount points, network adapters and more.

To use namespaces we call the clone(2) system call.

Creating a child process – fork vs clone

To create a new process in Linux, we can use the fork(2) or clone(2) system calls. We use fork(2) to create a new child process with a separate memory mapping (using CoW); we use clone(2) to create a child process that can share selected resources with its parent. One use of clone is to implement multithreading; another is to implement namespaces.

Namespaces with clone(2)

To create a child process in a new namespace with isolated resources, we need to use one or more of the following flags:

CLONE_NEWNET – isolate network devices

CLONE_NEWUTS – host and domain names (UNIX Timesharing System)

CLONE_NEWIPC – IPC objects

CLONE_NEWPID – PIDs

CLONE_NEWNS – mount points (file systems)

CLONE_NEWUSER – users and groups

Simple Example – NEWPID

To create a child process with PID=1 (a new process tree), call clone(2) with CLONE_NEWPID:

/* The second argument is the top of the child's stack (the end of the buffer). */
clone(child_fn, child_stack + sizeof(child_stack), CLONE_NEWPID, NULL);

getpid() in the child process returns 1, and getppid() returns 0. If the child process creates another child, it will get a process ID from the new tree.

Full example:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>
#include <sys/types.h>
#include <signal.h>

/* clone(2) does not allocate a stack for the child, so we provide one.
 * printf() and fork() need far more than a few kilobytes. */
#define STACK_SIZE (1024 * 1024)

static char child_stack[STACK_SIZE];

/*
 * Body of each grandchild: print the PIDs it sees inside the namespace,
 * then linger long enough for the namespace's PID 1 to kill one of them.
 *
 * num - 1-based index of this grandchild, used only in the message.
 */
static void grchild(int num)
{
    printf("child(%d) in ns my PID: %d Parent ID=%d\n",
           num, (int)getpid(), (int)getppid());
    sleep(5);
    puts("end child");
}

/*
 * Entry point of the cloned child; it runs as PID 1 of the new namespace.
 *
 * arg - points to a pid_t holding the parent's PID as numbered in the
 *       parent's namespace. The child runs on a copy of the parent's
 *       address space (no CLONE_VM), so the pointer is valid here.
 *
 * Returns the child's exit status.
 */
static int child_fn(void *arg)
{
    pid_t ppid = *(pid_t *)arg;
    int i;

    printf("PID: %ld Parent:%ld\n", (long)getpid(), (long)getppid());
    /* Flush before fork() so buffered text is not duplicated in the
     * grandchildren when stdout is not a terminal. */
    fflush(stdout);

    for (i = 0; i < 3; i++) {
        pid_t pid = fork();

        if (pid < 0) {
            perror("fork");
            return 1;
        }
        if (pid == 0) {
            grchild(i + 1);
            exit(0);
        }
        /* The parent's PID number is looked up in this namespace, where it
         * does not name the parent process. */
        kill(ppid, SIGKILL); // no effect
    }

    sleep(2);
    kill(2, SIGKILL); // kill the first child
    sleep(10);
    return 0;
}

/*
 * Create a child in a new PID namespace, hand it our PID and wait for it.
 * Must be run with the privileges required for CLONE_NEWPID (e.g. sudo).
 */
int main(void)
{
    pid_t parent_pid = getpid();
    /* SIGCHLD as the termination signal lets a plain waitpid() reap the
     * child; without it waitpid() fails unless __WCLONE/__WALL is used. */
    pid_t pid = clone(child_fn, child_stack + STACK_SIZE,
                      CLONE_NEWPID | SIGCHLD, &parent_pid);

    if (pid == -1) {
        perror("clone");
        return 1;
    }

    printf("clone() = %d\n", (int)pid);
    waitpid(pid, NULL, 0);
    return 0;
}

The main function creates a child process in a new PID namespace and sends its own PID to the child. The child creates 3 children.

If the child process tries to kill the parent (which is outside its namespace), nothing happens, but it can kill a process in its own namespace (in this case the first child).

If you build and run this sample (run with sudo)

# sudo ./simple
clone() = 5439
PID: 1 Parent:0
child(3) in ns my PID: 4 Parent ID=1
child(2) in ns my PID: 3 Parent ID=1
child(1) in ns my PID: 2 Parent ID=1
end child
end child

As you can see, the PIDs are 1-4 and the first child didn’t finish (SIGKILL): only two "end child" lines are printed.

Isolates Network Interfaces

To create a child process with different network interfaces use CLONE_NEWNET:

/* The second argument is the top of the child's stack (the end of the buffer). */
pid_t pid = clone(child_fn, child_stack + sizeof(child_stack), CLONE_NEWNET, NULL);

To create a virtual network adapter we can run ip command:

# sudo ip link add name veth0 type veth peer name veth1 netns [child pid]
# sudo ifconfig veth0 10.0.0.3

Now the child should run the command:

# ifconfig veth1 10.0.0.4

We could code all these commands ourselves, but for simplicity let's use the system(3) library function.

Full Example:

#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>

static char child_stack[1024 * 1024];

/*
 * Entry point of the cloned child, which runs in its own network namespace.
 * Configures its end of the veth pair, lists the interfaces it can see and
 * pings the parent's end.
 *
 * arg - unused.
 *
 * Returns the child's exit status.
 */
static int child_fn(void *arg)
{
    (void)arg;

    /* Crude synchronization: give the parent time to create the veth pair
     * and move veth1 into this namespace. */
    sleep(1);
    system("ifconfig veth1 10.0.0.4");

    puts("========= child network interfaces ========");
    /* Flush so our text appears before the output of the spawned command
     * even when stdout is not a terminal. */
    fflush(stdout);
    system("ifconfig -a");
    puts("===========================================");
    fflush(stdout);

    sleep(1);
    system("ping -c 3 10.0.0.3");
    return 0;
}

/*
 * Create a child in a new network namespace, connect it to the parent with
 * a veth pair and wait for it to finish. Run with sudo.
 */
int main(void)
{
    char buf[255];
    /* SIGCHLD as the termination signal lets a plain waitpid() reap the
     * child; without it waitpid() fails unless __WCLONE/__WALL is used. */
    pid_t pid = clone(child_fn, child_stack + sizeof(child_stack),
                      CLONE_NEWNET | SIGCHLD, NULL);

    if (pid == -1) {
        perror("clone");
        return 1;
    }

    snprintf(buf, sizeof(buf),
             "ip link add name veth0 type veth peer name veth1 netns %d",
             (int)pid);
    system(buf);
    system("ifconfig veth0 10.0.0.3");

    waitpid(pid, NULL, 0);
    return 0;
}

Run this test – the output:

========= child network interfaces ========
lo        Link encap:Local Loopback
          LOOPBACK  MTU:65536  Metric:1
          RX packets:0 errors:0 dropped:0 overruns:0 frame:0
          TX packets:0 errors:0 dropped:0 overruns:0 carrier:0
          collisions:0 txqueuelen:1
          RX bytes:0 (0.0 B)  TX bytes:0 (0.0 B)

veth1     Link encap:Ethernet  HWaddr 7a:d6:68:fb:c0:04
          inet addr:10.0.0.4  Bcast:10.255.255.255  Mask:255.0.0.0
          inet6 addr: fe80::78d6:68ff:fefb:c004/64 Scope:Link
          UP BROADCAST RUNNING MULTICAST  MTU:1500  Metric:1
          RX packets:0 errors:0 dropped:0 overruns:0 frame:0
          TX packets:0 errors:0 dropped:0 overruns:0 carrier:0
          collisions:0 txqueuelen:1000
          RX bytes:0 (0.0 B)  TX bytes:0 (0.0 B)

===========================================
PING 10.0.0.3 (10.0.0.3) 56(84) bytes of data.
64 bytes from 10.0.0.3: icmp_seq=1 ttl=64 time=0.076 ms
64 bytes from 10.0.0.3: icmp_seq=2 ttl=64 time=0.048 ms
64 bytes from 10.0.0.3: icmp_seq=3 ttl=64 time=0.071 ms

--- 10.0.0.3 ping statistics ---
3 packets transmitted, 3 received, 0% packet loss, time 1999ms
rtt min/avg/max/mdev = 0.048/0.065/0.076/0.012 ms

The child sees only the virtual adapter and can ping the parent using it

Mount Points and file system

To implement a container we also need to isolate the file system. This can be done using CLONE_NEWNS. Before coding, let's build a simple file system using BusyBox or Buildroot.

The simplest way is using buildroot – it is based on busybox.

Download and extract the package, use make menuconfig to enter the configuration menu, just exit and save the default selection and run make

# tar xvf ./buildroot-2017.11.2.tar.bz2
# cd buildroot-2017.11.2
# make menuconfig
# make

It will take a few minutes. After the build is finished you will find a file system in buildroot-2017.11.2/output/target

Copy the content to another folder (fs in my example) and add some device files to the /dev directory using mknod commands (buildroot can’t do that because it doesn’t run with sudo).

Full Example

#define _GNU_SOURCE
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/mount.h>
#include <sys/socket.h>
#include <stdio.h>
#include <stdlib.h>
#include <sched.h>
#include <signal.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <net/if.h>
#include <string.h>

#define STACK_SIZE (1024 * 1024)

static char stack[STACK_SIZE];

/*
 * Parse a dotted-quad IPv4 string into *dst as an AF_INET socket address.
 * Returns 0 on success, -1 if the text is not a valid IPv4 address.
 */
static int fill_ipv4(struct sockaddr *dst, const char *text)
{
    struct sockaddr_in sin;

    memset(&sin, 0, sizeof(sin));
    sin.sin_family = AF_INET;
    if (inet_pton(AF_INET, text, &sin.sin_addr) != 1) {
        fprintf(stderr, "invalid IPv4 address: %s\n", text);
        return -1;
    }
    memcpy(dst, &sin, sizeof(sin));
    return 0;
}

/*
 * Assign an IPv4 address and netmask to a network interface and bring it up.
 *
 * name    - interface name (e.g. "veth0"); truncated to IFNAMSIZ - 1 chars.
 * addr    - IPv4 address in dotted-quad notation.
 * netmask - IPv4 netmask in dotted-quad notation.
 *
 * Returns 0 on success, -1 on the first failing step (a message is printed
 * to stderr).
 */
int setip(const char *name, const char *addr, const char *netmask)
{
    struct ifreq ifr;
    int rc = -1;
    int fd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);

    if (fd < 0) {
        perror("socket");
        return -1;
    }

    memset(&ifr, 0, sizeof(ifr));
    snprintf(ifr.ifr_name, IFNAMSIZ, "%s", name);

    if (fill_ipv4(&ifr.ifr_addr, addr) != 0)
        goto out;
    if (ioctl(fd, SIOCSIFADDR, &ifr) < 0) {
        perror("ioctl(SIOCSIFADDR)");
        goto out;
    }

    if (fill_ipv4(&ifr.ifr_netmask, netmask) != 0)
        goto out;
    if (ioctl(fd, SIOCSIFNETMASK, &ifr) < 0) {
        perror("ioctl(SIOCSIFNETMASK)");
        goto out;
    }

    /* Read the current flags, then write them back with the interface
     * marked up. */
    if (ioctl(fd, SIOCGIFFLAGS, &ifr) < 0) {
        perror("ioctl(SIOCGIFFLAGS)");
        goto out;
    }
    ifr.ifr_flags |= (IFF_UP | IFF_RUNNING);
    if (ioctl(fd, SIOCSIFFLAGS, &ifr) < 0) {
        perror("ioctl(SIOCSIFFLAGS)");
        goto out;
    }

    rc = 0;
out:
    close(fd);
    return rc;
}

/*
 * Entry point of the cloned child: set the hostname, switch the root to
 * ./fs, mount /proc, configure veth1 and replace itself with /bin/sh.
 *
 * arg - unused.
 *
 * Returns 1 only if the container could not be set up or the shell could
 * not be started; on success execlp() does not return.
 */
int child(void *arg)
{
    (void)arg;

    /* Crude synchronization: give the parent time to create the veth pair
     * and move veth1 into this namespace. */
    sleep(1);

    if (sethostname("myhost", 6) < 0)
        perror("sethostname");

    /* Make every mount private so the /proc mount below is not propagated
     * back to the parent's mount namespace. */
    if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL) < 0)
        perror("mount(MS_PRIVATE)");

    /* Do not fall through to exec on failure: that would start a shell on
     * the host's file system instead of the container's. */
    if (chroot("./fs") < 0) {
        perror("chroot");
        return 1;
    }
    if (chdir("/") < 0) {
        perror("chdir");
        return 1;
    }

    if (mount("proc", "/proc", "proc", 0, NULL) < 0)
        perror("mount(/proc)");

    setip("veth1", "10.0.0.15", "255.0.0.0");

    execlp("/bin/sh", "/bin/sh", (char *)NULL);
    perror("execlp");
    return 1;
}

/*
 * Create the container child in new network, UTS, IPC, PID and mount
 * namespaces, connect it to the parent with a veth pair and wait for the
 * shell to exit. Run with sudo.
 */
int main(void)
{
    char buf[255];
    pid_t pid = clone(child, stack + STACK_SIZE,
                      CLONE_NEWNET | CLONE_NEWUTS | CLONE_NEWIPC |
                      CLONE_NEWPID | CLONE_NEWNS | SIGCHLD, NULL);

    if (pid == -1) {
        perror("clone");
        return 1;
    }

    snprintf(buf, sizeof(buf),
             "sudo ip link add name veth0 type veth peer name veth1 netns %d",
             (int)pid);
    system(buf);
    setip("veth0", "10.0.0.13", "255.0.0.0");

    waitpid(pid, NULL, 0);
    return 0;
}

We create a child process in new namespaces (PID, network, mount, IPC and UTS). The parent configures the virtual adapters (using ip link) and sets its own IP address.

The child changes the hostname, changes the root folder to our buildroot output, changes the current directory to ‘/’, mounts proc so that ps and other tools will work, and sets an IP address.

The last step the child does is calling the busybox shell (/bin/sh)

Run this program using sudo – you will get a different shell, file system and network:

That's it!

You can find the code with the full Buildroot image here

This is just a simple example to understand the concept. To implement a full container you also need to add capabilities, control groups and more.