Requirements

Basic knowledge about C programming

CUDA supported device (CUDA GPU list)

Installed CUDA SDK

Eclipse/nvidia nsight (nsight recommended)

Goal

The aim of this article is to get acquainted with bilinear interpolation technique and implement demo program which is going to scale given input image using nvidia CUDA SDK. The concept of interpolation is not explained here.

Optimus (hybrid graphics card)

What if I have CUDA compatibile device as a secondary graphics card (optimus)?

First of all (if you haven’t done it yet) install Bumblebee deamon.

It’s available on many popular distros, for instance:

# Archlinux pacman -S bumblebee # Ubuntu 12.04 sudo add-apt-repository ppa:bumblebee/stable sudo apt-get update sudo apt-get install bumblebee virtualgl linux-headers-generic

After reboot perform:

sudo tee /proc/acpi/bbswitch <<< ON modprobe nvidia

Turned on nvidia device will shorten your baterry life 🙂 Be aware of that and plug in.

Bit operations in brief

Let's consider the situation that we have ARGB pixel located in memory and we want to get out the blue channel. How to do that?

// Declare ARGB variable Uint32 color = 0xffd5b2c4;

Ok! What is that 0xff....??? This is a ARGB color written in hexadecimal form. Hmm... So how to read this?

/* Each of channels (alpha, red, green, blue) has value from 0 to 255. 1 byte is sufficeient to store the number, so on one octet (8 bits) you can write one channel. After decomposition: */ Uint8 a = 0xff; // 255 Uint8 r = 0xd5; // 213 Uint8 g = 0xb2; // 178 Uint8 b = 0xc4; // 196 /* How many bits do we need to store all four channels? 4*8 = 32 bits. Now it's perfectly clear the idea of Uint32 "color" variable. */

Let's pull out channels from "color" variable

printf("Blue: 0x%02x", (color && 0xff)); printf("Green: 0x%02x", ((color >> 8) && 0xff)); printf("Red: 0x%02x", ((color >> 16) && 0xff)); printf("Alpha: 0x%02x", ((color >> 24) && 0xff));

&& 0xff" is a binary AND mask which extracts only last 2 octets.

Simple DirectMedia Layer (SDL)

We're gonna use SDL to read/write image from/to file. SDL is natively written in C, so I'm convinced that is good choice.

In order to have SDL working, give linker a clue where to look for libs

# Library search path -L /usr/lib/ # Library -l SDL -l SDL_image

Read image

With the knowlege about bit operations, we'll make an attempt to read an image from disk using SDL library.

SDL_Surface *image = IMG_Load ("test.jpg"); if (!image){ printf ( "IMG_Load: %s

", IMG_GetError () ); return 1; }

Trivial isn't it?

Prepare new image

Ok, we have successfully read the image. Let's create the "output" sufrace where new image will be stored.

// Bit masks Uint32 amask = 0xff000000; Uint32 rmask = 0x00ff0000; Uint32 gmask = 0x0000ff00; Uint32 bmask = 0x000000ff; // New width of image int rWidth = 3000; int newWidth = image->w + (rWidth-image->w); int newHeight = image->h + (rWidth-image->w); // Create scaled image surface SDL_Surface *newImage = SDL_CreateRGBSurface(SDL_SWSURFACE, newWidth, newHeight, 32, rmask, gmask, bmask, amask);

Why BitsPerPixel == 32? In this case im 100% percent sure that the output will use an ARGB pallete of colours.

CUDA error handler

Before we start writting CUDA kernel function it is very convinient (and saves a lot of time for debugging) to create an error handler method for CUDA kernels.

void cudasafe(int error, char* message, char* file, int line) { if (error != cudaSuccess) { fprintf(stderr, "CUDA Error: %s : %i. In %s line %d

", message, error, file, line); exit(-1); } }

Memory allocation (CUDA)

It's time to copy and preserve some memory on CUDA device.

// Get output image size int newImageByteLength = newImage->w * newImage->h * sizeof(Uint8)*newImage->format->BytesPerPixel; // Create pointer to device and host pixels Uint8 *pixels = (Uint8*)image->pixels; Uint8 *pixels_dyn; // Copy original image cudasafe(cudaMalloc((void **) &pixels_dyn, imageByteLength),"Original image allocation ",__FILE__,__LINE__); cudasafe(cudaMemcpy(pixels_dyn, pixels, imageByteLength, cudaMemcpyHostToDevice),"Copy original image to device ",__FILE__,__LINE__); // Allocate new image on DEVICE Uint8 *newPixels_dyn; Uint8 *newPixels = (Uint8*)malloc(newImageByteLength); cudasafe(cudaMalloc((void **) &newPixels_dyn, newImageByteLength),"New image allocation ",__FILE__,__LINE__);

"Transform" the kernel function

Eventually, we will touch the "heart" of the program. Transform kernel function will take four corresponding pixels and based on that do specific calculations. If you want to know more about interpolation I recommend you this article

__global__ void cudaTransform(Uint8 *output, Uint8 *input, Uint16 pitchOutput, Uint16 pitchInput, Uint8 bytesPerPixelInput, Uint8 bytesPerPixelOutput, float xRatio, float yRatio){ int x = (int) (xRatio * blockIdx.x); int y = (int) (yRatio * blockIdx.y); Uint8 *a; Uint8 *b; Uint8 *c; Uint8 *d; float xDist, yDist, blue, red, green; // X and Y distance difference xDist = (xRatio * blockIdx.x) - x; yDist = (yRatio * blockIdx.y) - y; // Points a = input + y * pitchInput + x * bytesPerPixelInput; b = input + y * pitchInput + (x+1) * bytesPerPixelInput; c = input + (y+1) * pitchInput + x * bytesPerPixelInput; d = input + (y+1) * pitchInput + (x+1) * bytesPerPixelInput; // blue blue = (a[2])*(1 - xDist)*(1 - yDist) + (b[2])*(xDist)*(1 - yDist) + (c[2])*(yDist)*(1 - xDist) + (d[2])*(xDist * yDist); // green green = ((a[1]))*(1 - xDist)*(1 - yDist) + (b[1])*(xDist)*(1 - yDist) + (c[1])*(yDist)*(1 - xDist) + (d[1])*(xDist * yDist); // red red = (a[0])*(1 - xDist)*(1 - yDist) + (b[0])*(xDist)*(1 - yDist) + (c[0])*(yDist)*(1 - xDist) + (d[0])*(xDist * yDist); Uint8 *p = output + blockIdx.y * pitchOutput + blockIdx.x * bytesPerPixelOutput; *(Uint32*)p = 0xff000000 | ((((int)red) << 16)) | ((((int)green) << 8)) | ((int)blue); }

Save image

After saving an image, we would like to keep environment clean, so deallocate memory by cudaFree function.

// Copy scaled image to host cudasafe(cudaMemcpy(newPixels, newPixels_dyn, newImageByteLength, cudaMemcpyDeviceToHost),"from device to host", __FILE__, __LINE__); newImage->pixels = newPixels; // Free memory cudaFree(pixels_dyn); cudaFree(newPixels_dyn); //Save image SDL_SaveBMP(newImage, "test2.bmp"); // Free surfaces SDL_FreeSurface (image); SDL_FreeSurface (newImage); SDL_Quit();

Summary

All in one.

void cudasafe(int error, char* message, char* file, int line) { if (error != cudaSuccess) { fprintf(stderr, "CUDA Error: %s : %i. In %s line %d

", message, error, file, line); exit(-1); } } __global__ void cudaTransform(Uint8 *output, Uint8 *input, Uint16 pitchOutput, Uint16 pitchInput, Uint8 bytesPerPixelInput, Uint8 bytesPerPixelOutput, float xRatio, float yRatio){ int x = (int) (xRatio * blockIdx.x); int y = (int) (yRatio * blockIdx.y); Uint8 *a; Uint8 *b; Uint8 *c; Uint8 *d; float xDist, yDist, blue, red, green; // X and Y distance difference xDist = (xRatio * blockIdx.x) - x; yDist = (yRatio * blockIdx.y) - y; // Points a = input + y * pitchInput + x * bytesPerPixelInput; b = input + y * pitchInput + (x+1) * bytesPerPixelInput; c = input + (y+1) * pitchInput + x * bytesPerPixelInput; d = input + (y+1) * pitchInput + (x+1) * bytesPerPixelInput; // blue blue = (a[2])*(1 - xDist)*(1 - yDist) + (b[2])*(xDist)*(1 - yDist) + (c[2])*(yDist)*(1 - xDist) + (d[2])*(xDist * yDist); // green green = ((a[1]))*(1 - xDist)*(1 - yDist) + (b[1])*(xDist)*(1 - yDist) + (c[1])*(yDist)*(1 - xDist) + (d[1])*(xDist * yDist); // red red = (a[0])*(1 - xDist)*(1 - yDist) + (b[0])*(xDist)*(1 - yDist) + (c[0])*(yDist)*(1 - xDist) + (d[0])*(xDist * yDist); Uint8 *p = output + blockIdx.y * pitchOutput + blockIdx.x * bytesPerPixelOutput; *(Uint32*)p = 0xff000000 | ((((int)red) << 16)) | ((((int)green) << 8)) | ((int)blue); } int main(void) { Uint32 amask = 0xff000000; Uint32 rmask = 0x00ff0000; Uint32 gmask = 0x0000ff00; Uint32 bmask = 0x000000ff; SDL_Surface *image = IMG_Load ("test.jpg"); int imageByteLength = image->w * image->h * sizeof(Uint8)*image->format->BytesPerPixel; if (!image){ printf ( "IMG_Load: %s

", IMG_GetError () ); return 1; } // New width of image int rWidth = 3000; int newWidth = image->w + (rWidth-image->w); int newHeight = image->h + (rWidth-image->w); dim3 grid(newWidth,newHeight); // Create scaled image surface SDL_Surface *newImage = SDL_CreateRGBSurface(SDL_SWSURFACE, newWidth, newHeight, 32, rmask, gmask, bmask, amask); int newImageByteLength = newImage->w * newImage->h * sizeof(Uint8)*newImage->format->BytesPerPixel; float xRatio = ((float)(image->w-1))/newImage->w; float yRatio = ((float)(image->h-1))/newImage->h; // Create pointer to device and host pixels Uint8 *pixels = (Uint8*)image->pixels; Uint8 *pixels_dyn; cudaEvent_t start, stop; float time; cudaEventCreate(&start); cudaEventCreate(&stop); // Copy original image cudasafe(cudaMalloc((void **) &pixels_dyn, imageByteLength),"Original image allocation ",__FILE__,__LINE__); cudasafe(cudaMemcpy(pixels_dyn, pixels, imageByteLength, cudaMemcpyHostToDevice),"Copy original image to device ",__FILE__,__LINE__); // Allocate new image on DEVICE Uint8 *newPixels_dyn; Uint8 *newPixels = (Uint8*)malloc(newImageByteLength); cudasafe(cudaMalloc((void **) &newPixels_dyn, newImageByteLength),"New image allocation ",__FILE__,__LINE__); // Start measuring time cudaEventRecord(start, 0); // Do the bilinear transform on CUDA device cudaTransform<<< grid,1 >>>(newPixels_dyn, pixels_dyn, newImage->pitch, image->pitch, image->format->BytesPerPixel, newImage->format->BytesPerPixel, xRatio, yRatio); // Stop the timer cudaEventRecord(stop, 0); cudaEventSynchronize(stop); // Copy scaled image to host cudasafe(cudaMemcpy(newPixels, newPixels_dyn, newImageByteLength, cudaMemcpyDeviceToHost),"from device to host", __FILE__, __LINE__); newImage->pixels = newPixels; // Free memory cudaFree(pixels_dyn); cudaFree(newPixels_dyn); cudaEventElapsedTime(&time, start, stop); printf ("Time for the kernel: %f ms

", time); //Save image SDL_SaveBMP(newImage, "test2.bmp"); // Free surfaces SDL_FreeSurface (image); SDL_FreeSurface (newImage); SDL_Quit(); }

Demo

Sources: Project files