diff --git a/2014_06_19_Comprimato_Ultimate_Commodity_supercomputer/Makefile b/2014_06_19_Comprimato_Ultimate_Commodity_supercomputer/Makefile new file mode 100644 index 0000000..c9f21ea --- /dev/null +++ b/2014_06_19_Comprimato_Ultimate_Commodity_supercomputer/Makefile @@ -0,0 +1,13 @@ + +BINARIES = $(basename $(wildcard v*pu.c*)) + +all: $(BINARIES) + +clean: + rm -f $(BINARIES) + +%-cpu: %-cpu.c + gcc -std=c99 -pedantic -Wextra -O3 $< -lm -o $@ + +%-gpu: %-gpu.cu + nvcc -gencode arch=compute_30,code=sm_30 -O3 $< -o $@ diff --git a/2014_06_19_Comprimato_Ultimate_Commodity_supercomputer/Ysoft-Tech-Hour.pdf b/2014_06_19_Comprimato_Ultimate_Commodity_supercomputer/Ysoft-Tech-Hour.pdf new file mode 100644 index 0000000..236a2a0 Binary files /dev/null and b/2014_06_19_Comprimato_Ultimate_Commodity_supercomputer/Ysoft-Tech-Hour.pdf differ diff --git a/2014_06_19_Comprimato_Ultimate_Commodity_supercomputer/v01-cpu b/2014_06_19_Comprimato_Ultimate_Commodity_supercomputer/v01-cpu new file mode 100755 index 0000000..b3a647a Binary files /dev/null and b/2014_06_19_Comprimato_Ultimate_Commodity_supercomputer/v01-cpu differ diff --git a/2014_06_19_Comprimato_Ultimate_Commodity_supercomputer/v01-cpu.c b/2014_06_19_Comprimato_Ultimate_Commodity_supercomputer/v01-cpu.c new file mode 100644 index 0000000..733b241 --- /dev/null +++ b/2014_06_19_Comprimato_Ultimate_Commodity_supercomputer/v01-cpu.c @@ -0,0 +1,138 @@ +#include +#include +#include +#include +#include + + +#define MAX_PATH_LEN (32 * 1024) +#define MAX_KERNEL_RADIUS 16 + +static void error(const char * message) { + fprintf(stderr, "ERROR: %s\n", message); + exit(-1); +} + +static void usage(const char * message, const char * app) { + fprintf(stderr, "Usage: %s width height sigma file1 ... fileN\n", app); + fprintf(stderr, "Example: %s 1920 1080 3 f1.gray f2.gray f3.gray\n", app); + error(message); +} + +static double timer_ms() { + struct timeval tv; + gettimeofday(&tv, NULL); + return tv.tv_sec * 1000.0 + tv.tv_usec * 0.001; +} + +static int saturate(int n, int max) { + return n < 0 ? 0 : (n < max ? n : max - 1); +} + +static int get_pix(const uint8_t * src, int w, int h, int x, int y) { + return src[saturate(x, w) + saturate(y, h) * w]; +} + +static float gaussian(float sigma, float x) { + const float e = x / sigma; + return exp(-0.5 * e * e); +} + +int main(int argn, char ** argv) { + if(argn < 4) { + usage("Wrong argument count", *argv); + } + + // read width and height + const int w = atoi(argv[1]); + const int h = atoi(argv[2]); + if(w < 1 || h < 1) { + usage("Both width and height must be positive integers", *argv); + } + const int pix_count = w * h; + + // read sigma and prepare normalized kernel (sum = 1) + const float sigma = atof(argv[3]); + float kernel[MAX_KERNEL_RADIUS + 1]; + float kernel_sum = 0.0f; + for(int k = 0; k <= MAX_KERNEL_RADIUS; k++) { + kernel_sum += kernel[k] = gaussian(sigma, k); + } + kernel_sum = 2.0 * kernel_sum - kernel[0]; + for(int k = 0; k <= MAX_KERNEL_RADIUS; k++) { + kernel[k] /= kernel_sum; + } + + // dump the kernel + printf("Convolution kernel:"); + for(int k = -MAX_KERNEL_RADIUS; k <= MAX_KERNEL_RADIUS; k++) { + printf(" %f", kernel[k < 0 ? -k : k]); + } + printf("\n"); + + // prepare buffers + uint8_t * const data_ptr = (uint8_t*)malloc(pix_count); + uint8_t * const temp_ptr = (uint8_t*)malloc(pix_count); + + // measure time of processing of all images + const double begin = timer_ms(); + for(int i = 4; i < argn; i++) { + // read input data + printf("Processing '%s'\n", argv[i]); + FILE * const src_file = fopen(argv[i], "rb"); + if(NULL == src_file || 1 != fread(data_ptr, pix_count, 1, src_file)) { + error(argv[i]); + } + fclose(src_file); + + // vertical pass: for each pixel + uint8_t * out_pix_ptr = temp_ptr; + for(int y = 0; y < h; y++) { + for(int x = 0; x < w; x++) { + // sum up all weighted neighbors and the pixel itself + float result = kernel[0] * get_pix(data_ptr, w, h, x, y); + for(int k = 1; k <= MAX_KERNEL_RADIUS; k++) { + result += kernel[k] * (get_pix(data_ptr, w, h, x, y + k) + + get_pix(data_ptr, w, h, x, y - k)); + } + *(out_pix_ptr++) = saturate((int)result, 256); + } + } + + // horizontal pass: for each pixel + out_pix_ptr = data_ptr; + for(int y = 0; y < h; y++) { + for(int x = 0; x < w; x++) { + // sum up all weighted neighbors and the pixel itself + float result = kernel[0] * get_pix(temp_ptr, w, h, x, y); + for(int k = 1; k <= MAX_KERNEL_RADIUS; k++) { + result += kernel[k] * (get_pix(temp_ptr, w, h, x + k, y) + + get_pix(temp_ptr, w, h, x - k, y)); + } + *(out_pix_ptr++) = saturate((int)result, 256); + } + } + + // compose output filename + char out_path[MAX_PATH_LEN + 1]; + snprintf(out_path, MAX_PATH_LEN, "%s.out.gray", argv[i]); + + // write data to output file + FILE * const out_file = fopen(out_path, "wb"); + if(NULL == out_file || 1 != fwrite(data_ptr, pix_count, 1, out_file)) { + error(out_path); + } + fclose(out_file); + } + const double end = timer_ms(); + + // print total time + printf("time: %f ms, %d images => %f ms/image\n", + end - begin, argn - 4, (end - begin) / (argn - 4)); + + // cleanup + free(temp_ptr); + free(data_ptr); + return 0; +} + diff --git a/2014_06_19_Comprimato_Ultimate_Commodity_supercomputer/v02-gpu.cu b/2014_06_19_Comprimato_Ultimate_Commodity_supercomputer/v02-gpu.cu new file mode 100644 index 0000000..b6082e4 --- /dev/null +++ b/2014_06_19_Comprimato_Ultimate_Commodity_supercomputer/v02-gpu.cu @@ -0,0 +1,157 @@ +#include +#include +#include +#include +#include + + +#define MAX_PATH_LEN (32 * 1024) +#define MAX_KERNEL_RADIUS 16 + +struct kernel_params { + float kernel[MAX_KERNEL_RADIUS + 1]; + int w; + int h; +}; + +static void error(const char * message) { + fprintf(stderr, "ERROR: %s\n", message); + exit(-1); +} + +static void usage(const char * message, const char * app) { + fprintf(stderr, "Usage: %s width height sigma file1 ... fileN\n", app); + fprintf(stderr, "Example: %s 1920 1080 3 f1.gray f2.gray f3.gray\n", app); + error(message); +} + +static double timer_ms() { + struct timeval tv; + gettimeofday(&tv, NULL); + return tv.tv_sec * 1000.0 + tv.tv_usec * 0.001; +} + +static __device__ int saturate(int n, int max_value) { + return max(0, min(n, max_value - 1)); +} + +static __device__ int get_pix(const uint8_t * src, int w, int h, int x, int y) { + return (float)src[saturate(x, w) + saturate(y, h) * w]; +} + +template +static __global__ void convolution(kernel_params p, uint8_t * src, uint8_t * dest) { + // coordinates of processed pixel + const int x = threadIdx.x + blockIdx.x * blockDim.x; + const int y = threadIdx.y + blockIdx.y * blockDim.y; + + // stop if out of bounds + if(x >= p.w || y >= p.h) { + return; + } + + // get weighted sum of neighbors + float result = p.kernel[0] * get_pix(src, p.w, p.h, x, y); + for(int k = 1; k <= MAX_KERNEL_RADIUS; k++) { + result += p.kernel[k] * (get_pix(src, p.w, p.h, x + k * DX, y + k * DY) + + get_pix(src, p.w, p.h, x - k * DX, y - k * DY)); + } + + // save result + dest[x + y * p.w] = saturate((int)result, 256); +} + + +static float gaussian(float sigma, float x) { + const float e = x / sigma; + return exp(-0.5 * e * e); +} + +int main(int argn, char ** argv) { + kernel_params params; + + if(argn < 4) { + usage("Wrong argument count", *argv); + } + + // read width and height + params.w = atoi(argv[1]); + params.h = atoi(argv[2]); + if(params.w < 1 || params.h < 1) { + usage("Both width and height must be positive integers", *argv); + } + const int pix_count = params.w * params.h; + + // read sigma and prepare normalized kernel (sum = 1) + const float sigma = atof(argv[3]); + float kernel_sum = 0.0f; + for(int k = 0; k <= MAX_KERNEL_RADIUS; k++) { + kernel_sum += params.kernel[k] = gaussian(sigma, k); + } + kernel_sum = 2.0 * kernel_sum - params.kernel[0]; + for(int k = 0; k <= MAX_KERNEL_RADIUS; k++) { + params.kernel[k] /= kernel_sum; + } + + // dump the kernel + printf("Convolution kernel:"); + for(int k = -MAX_KERNEL_RADIUS; k <= MAX_KERNEL_RADIUS; k++) { + printf(" %f", params.kernel[k < 0 ? -k : k]); + } + printf("\n"); + + // prepare buffers + uint8_t * const data_ptr = (uint8_t*)malloc(pix_count); + uint8_t * data_gpu_ptr; + uint8_t * temp_gpu_ptr; + cudaMalloc((void**)&data_gpu_ptr, pix_count); + cudaMalloc((void**)&temp_gpu_ptr, pix_count); + + // measure time of processing of all images + const double begin = timer_ms(); + for(int i = 4; i < argn; i++) { + // read input data + printf("Processing '%s'\n", argv[i]); + FILE * const src_file = fopen(argv[i], "rb"); + if(NULL == src_file || 1 != fread(data_ptr, pix_count, 1, src_file)) { + error(argv[i]); + } + fclose(src_file); + + // copy data to GPU memory + cudaMemcpy(data_gpu_ptr, data_ptr, pix_count, cudaMemcpyHostToDevice); + + // launch vertical and horizontal pass + dim3 block(32, 32); + dim3 grid((params.w + block.x - 1) / block.x, + (params.h + block.y - 1) / block.y); + convolution<0, 1><<>>(params, data_gpu_ptr, temp_gpu_ptr); + convolution<1, 0><<>>(params, temp_gpu_ptr, data_gpu_ptr); + + // copy data back from GPU + cudaMemcpy(data_ptr, data_gpu_ptr, pix_count, cudaMemcpyDeviceToHost); + + // compose output filename + char out_path[MAX_PATH_LEN + 1]; + snprintf(out_path, MAX_PATH_LEN, "%s.out.gray", argv[i]); + + // write data to output file + FILE * const out_file = fopen(out_path, "wb"); + if(NULL == out_file || 1 != fwrite(data_ptr, pix_count, 1, out_file)) { + error(out_path); + } + fclose(out_file); + } + const double end = timer_ms(); + + // print total time + printf("time: %f ms, %d images => %f ms/image\n", + end - begin, argn - 4, (end - begin) / (argn - 4)); + + // cleanup + free(data_ptr); + cudaFree(data_gpu_ptr); + cudaFree(temp_gpu_ptr); + return 0; +} + diff --git a/2014_06_19_Comprimato_Ultimate_Commodity_supercomputer/v03-cpu.c b/2014_06_19_Comprimato_Ultimate_Commodity_supercomputer/v03-cpu.c new file mode 100644 index 0000000..d40ca0e --- /dev/null +++ b/2014_06_19_Comprimato_Ultimate_Commodity_supercomputer/v03-cpu.c @@ -0,0 +1,139 @@ +#include +#include +#include +#include +#include + + +#define MAX_PATH_LEN (32 * 1024) +#define MAX_KERNEL_RADIUS 16 + +static void error(const char * message) { + fprintf(stderr, "ERROR: %s\n", message); + exit(-1); +} + +static void usage(const char * message, const char * app) { + fprintf(stderr, "Usage: %s width height sigma file1 ... fileN\n", app); + fprintf(stderr, "Example: %s 1920 1080 3 f1.gray f2.gray f3.gray\n", app); + error(message); +} + +static double timer_ms() { + struct timeval tv; + gettimeofday(&tv, NULL); + return tv.tv_sec * 1000.0 + tv.tv_usec * 0.001; +} + +static int saturate(int n, int max) { + return n < 0 ? 0 : (n < max ? n : max - 1); +} + +static int get_pix(const uint8_t * src, int w, int h, int x, int y) { + return src[saturate(x, w) + saturate(y, h) * w]; +} + +static float gaussian(float sigma, float x) { + const float e = x / sigma; + return exp(-0.5 * e * e); +} + +int main(int argn, char ** argv) { + if(argn < 4) { + usage("Wrong argument count", *argv); + } + + // read width and height + const int w = atoi(argv[1]); + const int h = atoi(argv[2]); + if(w < 1 || h < 1) { + usage("Both width and height must be positive integers", *argv); + } + const int pix_count = w * h; + + // read sigma and prepare normalized kernel (sum = 1) + const float sigma = atof(argv[3]); + float kernel[MAX_KERNEL_RADIUS + 1]; + float kernel_sum = 0.0f; + for(int k = 0; k <= MAX_KERNEL_RADIUS; k++) { + kernel_sum += kernel[k] = gaussian(sigma, k); + } + kernel_sum = 2.0 * kernel_sum - kernel[0]; + for(int k = 0; k <= MAX_KERNEL_RADIUS; k++) { + kernel[k] /= kernel_sum; + } + + // dump the kernel + printf("Convolution kernel:"); + for(int k = -MAX_KERNEL_RADIUS; k <= MAX_KERNEL_RADIUS; k++) { + printf(" %f", kernel[k < 0 ? -k : k]); + } + printf("\n"); + + // prepare buffers + uint8_t * const data_ptr = (uint8_t*)malloc(pix_count); + uint8_t * const temp_ptr = (uint8_t*)malloc(pix_count); + + // measure time of processing of all images + const double begin = timer_ms(); + #pragma omp parallel for + for(int i = 4; i < argn; i++) { + // read input data + printf("Processing '%s'\n", argv[i]); + FILE * const src_file = fopen(argv[i], "rb"); + if(NULL == src_file || 1 != fread(data_ptr, pix_count, 1, src_file)) { + error(argv[i]); + } + fclose(src_file); + + // vertical pass: for each pixel + uint8_t * out_pix_ptr = temp_ptr; + for(int y = 0; y < h; y++) { + for(int x = 0; x < w; x++) { + // sum up all weighted neighbors and the pixel itself + float result = kernel[0] * get_pix(data_ptr, w, h, x, y); + for(int k = 1; k <= MAX_KERNEL_RADIUS; k++) { + result += kernel[k] * (get_pix(data_ptr, w, h, x, y + k) + + get_pix(data_ptr, w, h, x, y - k)); + } + *(out_pix_ptr++) = saturate((int)result, 256); + } + } + + // horizontal pass: for each pixel + out_pix_ptr = data_ptr; + for(int y = 0; y < h; y++) { + for(int x = 0; x < w; x++) { + // sum up all weighted neighbors and the pixel itself + float result = kernel[0] * get_pix(temp_ptr, w, h, x, y); + for(int k = 1; k <= MAX_KERNEL_RADIUS; k++) { + result += kernel[k] * (get_pix(temp_ptr, w, h, x + k, y) + + get_pix(temp_ptr, w, h, x - k, y)); + } + *(out_pix_ptr++) = saturate((int)result, 256); + } + } + + // compose output filename + char out_path[MAX_PATH_LEN + 1]; + snprintf(out_path, MAX_PATH_LEN, "%s.out.gray", argv[i]); + + // write data to output file + FILE * const out_file = fopen(out_path, "wb"); + if(NULL == out_file || 1 != fwrite(data_ptr, pix_count, 1, out_file)) { + error(out_path); + } + fclose(out_file); + } + const double end = timer_ms(); + + // print total time + printf("time: %f ms, %d images => %f ms/image\n", + end - begin, argn - 4, (end - begin) / (argn - 4)); + + // cleanup + free(temp_ptr); + free(data_ptr); + return 0; +} + diff --git a/2014_06_19_Comprimato_Ultimate_Commodity_supercomputer/v04-gpu.cu b/2014_06_19_Comprimato_Ultimate_Commodity_supercomputer/v04-gpu.cu new file mode 100644 index 0000000..66e81ba --- /dev/null +++ b/2014_06_19_Comprimato_Ultimate_Commodity_supercomputer/v04-gpu.cu @@ -0,0 +1,205 @@ +#include +#include +#include +#include +#include + + +#define MAX_PATH_LEN (32 * 1024) +#define MAX_KERNEL_RADIUS 16 + +// thread block size +#define TX 32 +#define TY 32 + +struct kernel_params { + float kernel[MAX_KERNEL_RADIUS + 1]; + int w; + int h; +}; + +static void error(const char * message) { + fprintf(stderr, "ERROR: %s\n", message); + exit(-1); +} + +static void usage(const char * message, const char * app) { + fprintf(stderr, "Usage: %s width height sigma file1 ... fileN\n", app); + fprintf(stderr, "Example: %s 1920 1080 3 f1.gray f2.gray f3.gray\n", app); + error(message); +} + +static double timer_ms() { + struct timeval tv; + gettimeofday(&tv, NULL); + return tv.tv_sec * 1000.0 + tv.tv_usec * 0.001; +} + +static __device__ int saturate(int n, int max_value) { + return max(0, min(n, max_value - 1)); +} + +static __device__ int get_pix(const uint8_t * src, int w, int h, int x, int y) { + return (float)src[saturate(x, w) + saturate(y, h) * w]; +} + +static __global__ void convolution_vertical(kernel_params p, uint8_t * src, uint8_t * dest) { + // coordinates of pixel processed by this thread + const int x = threadIdx.x + blockIdx.x * TX; + const int y = threadIdx.y + blockIdx.y * TY; + + // shared cache for processed pixels + __shared__ float cache[TY + 2 * MAX_KERNEL_RADIUS][TX]; + + // all threads populate shared cache + for(int ny = 0; ny < 2; ny++) { + cache[threadIdx.y + ny * TY][threadIdx.x] + = get_pix(src, p.w, p.h, x, y - MAX_KERNEL_RADIUS + ny * TY); + } + + // wait for all threads of block to finish their contribution to cache + __syncthreads(); + + // stop this thread if out of bounds + if(x >= p.w || y >= p.h) { + return; + } + + // get weighted sum of neighbors + float result = p.kernel[0] * cache[MAX_KERNEL_RADIUS + threadIdx.y][threadIdx.x]; + for(int k = 1; k <= MAX_KERNEL_RADIUS; k++) { + result += p.kernel[k] * cache[MAX_KERNEL_RADIUS + threadIdx.y - k][threadIdx.x]; + result += p.kernel[k] * cache[MAX_KERNEL_RADIUS + threadIdx.y + k][threadIdx.x]; + } + + // save result + dest[x + y * p.w] = saturate((int)result, 256); +} + + +static __global__ void convolution_horizontal(kernel_params p, uint8_t * src, uint8_t * dest) { + // coordinates of pixel processed by this thread + const int x = threadIdx.x + blockIdx.x * TX; + const int y = threadIdx.y + blockIdx.y * TY; + + // shared cache for processed pixels + __shared__ float cache[TY][TX + 2 * MAX_KERNEL_RADIUS]; + + // all threads populate shared cache + for(int nx = 0; nx < 2; nx++) { + cache[threadIdx.y][threadIdx.x + nx * TX] + = get_pix(src, p.w, p.h, x - MAX_KERNEL_RADIUS + nx * TX, y); + } + + // wait for all threads of block to finish their contribution to cache + __syncthreads(); + + // stop this thread if out of bounds + if(x >= p.w || y >= p.h) { + return; + } + + // get weighted sum of neighbors + float result = p.kernel[0] * cache[threadIdx.y][MAX_KERNEL_RADIUS + threadIdx.x]; + for(int k = 1; k <= MAX_KERNEL_RADIUS; k++) { + result += p.kernel[k] * cache[threadIdx.y][MAX_KERNEL_RADIUS + threadIdx.x + k]; + result += p.kernel[k] * cache[threadIdx.y][MAX_KERNEL_RADIUS + threadIdx.x - k]; + } + + // save result + dest[x + y * p.w] = saturate((int)result, 256); +} + + +static float gaussian(float sigma, float x) { + const float e = x / sigma; + return exp(-0.5 * e * e); +} + +int main(int argn, char ** argv) { + kernel_params params; + + if(argn < 4) { + usage("Wrong argument count", *argv); + } + + // read width and height + params.w = atoi(argv[1]); + params.h = atoi(argv[2]); + if(params.w < 1 || params.h < 1) { + usage("Both width and height must be positive integers", *argv); + } + const int pix_count = params.w * params.h; + + // read sigma and prepare normalized kernel (sum = 1) + const float sigma = atof(argv[3]); + float kernel_sum = 0.0f; + for(int k = 0; k <= MAX_KERNEL_RADIUS; k++) { + kernel_sum += params.kernel[k] = gaussian(sigma, k); + } + kernel_sum = 2.0 * kernel_sum - params.kernel[0]; + for(int k = 0; k <= MAX_KERNEL_RADIUS; k++) { + params.kernel[k] /= kernel_sum; + } + + // dump the kernel + printf("Convolution kernel:"); + for(int k = -MAX_KERNEL_RADIUS; k <= MAX_KERNEL_RADIUS; k++) { + printf(" %f", params.kernel[k < 0 ? -k : k]); + } + printf("\n"); + + // prepare buffers + uint8_t * const data_ptr = (uint8_t*)malloc(pix_count); + uint8_t * data_gpu_ptr; + uint8_t * temp_gpu_ptr; + cudaMalloc((void**)&data_gpu_ptr, pix_count); + cudaMalloc((void**)&temp_gpu_ptr, pix_count); + + // measure time of processing of all images + const double begin = timer_ms(); + for(int i = 4; i < argn; i++) { + // read input data + printf("Processing '%s'\n", argv[i]); + FILE * const src_file = fopen(argv[i], "rb"); + if(NULL == src_file || 1 != fread(data_ptr, pix_count, 1, src_file)) { + error(argv[i]); + } + fclose(src_file); + + // copy data to GPU memory + cudaMemcpy(data_gpu_ptr, data_ptr, pix_count, cudaMemcpyHostToDevice); + + // launch vertical and horizontal pass + dim3 block(TX, TY); + dim3 grid((params.w + TX - 1) / TX, (params.h + TY - 1) / TY); + convolution_vertical<<>>(params, data_gpu_ptr, temp_gpu_ptr); + convolution_horizontal<<>>(params, temp_gpu_ptr, data_gpu_ptr); + + // copy data back from GPU + cudaMemcpy(data_ptr, data_gpu_ptr, pix_count, cudaMemcpyDeviceToHost); + + // compose output filename + char out_path[MAX_PATH_LEN + 1]; + snprintf(out_path, MAX_PATH_LEN, "%s.out.gray", argv[i]); + + // write data to output file + FILE * const out_file = fopen(out_path, "wb"); + if(NULL == out_file || 1 != fwrite(data_ptr, pix_count, 1, out_file)) { + error(out_path); + } + fclose(out_file); + } + const double end = timer_ms(); + + // print total time + printf("time: %f ms, %d images => %f ms/image\n", + end - begin, argn - 4, (end - begin) / (argn - 4)); + + // cleanup + free(data_ptr); + cudaFree(data_gpu_ptr); + cudaFree(temp_gpu_ptr); + return 0; +} + diff --git a/2014_06_19_Comprimato_Ultimate_Commodity_supercomputer/v05-gpu.cu b/2014_06_19_Comprimato_Ultimate_Commodity_supercomputer/v05-gpu.cu new file mode 100644 index 0000000..868b8e5 --- /dev/null +++ b/2014_06_19_Comprimato_Ultimate_Commodity_supercomputer/v05-gpu.cu @@ -0,0 +1,243 @@ +#include +#include +#include +#include +#include + + +#define MAX_PATH_LEN (32 * 1024) +#define MAX_KERNEL_RADIUS 16 + +// thread block size +#define TX 32 +#define TY 32 + +struct kernel_params { + float kernel[MAX_KERNEL_RADIUS + 1]; + int w; + int h; +}; + +static void error(const char * message) { + fprintf(stderr, "ERROR: %s\n", message); + exit(-1); +} + +static void usage(const char * message, const char * app) { + fprintf(stderr, "Usage: %s width height sigma file1 ... fileN\n", app); + fprintf(stderr, "Example: %s 1920 1080 3 f1.gray f2.gray f3.gray\n", app); + error(message); +} + +static double timer_ms() { + struct timeval tv; + gettimeofday(&tv, NULL); + return tv.tv_sec * 1000.0 + tv.tv_usec * 0.001; +} + +static __device__ int saturate(int n, int max_value) { + return max(0, min(n, max_value - 1)); +} + +static __device__ int get_pix(const uint8_t * src, int w, int h, int x, int y) { + return (float)src[saturate(x, w) + saturate(y, h) * w]; +} + +static __global__ void convolution_vertical(kernel_params p, uint8_t * src, uint8_t * dest) { + // coordinates of pixel processed by this thread + const int x = threadIdx.x + blockIdx.x * TX; + const int y = threadIdx.y + blockIdx.y * TY; + + // shared cache for processed pixels + __shared__ float cache[TY + 2 * MAX_KERNEL_RADIUS][TX]; + + // all threads populate shared cache + for(int ny = 0; ny < 2; ny++) { + cache[threadIdx.y + ny * TY][threadIdx.x] + = get_pix(src, p.w, p.h, x, y - MAX_KERNEL_RADIUS + ny * TY); + } + + // wait for all threads of block to finish their contribution to cache + __syncthreads(); + + // stop this thread if out of bounds + if(x >= p.w || y >= p.h) { + return; + } + + // get weighted sum of neighbors + float result = p.kernel[0] * cache[MAX_KERNEL_RADIUS + threadIdx.y][threadIdx.x]; + for(int k = 1; k <= MAX_KERNEL_RADIUS; k++) { + result += p.kernel[k] * cache[MAX_KERNEL_RADIUS + threadIdx.y - k][threadIdx.x]; + result += p.kernel[k] * cache[MAX_KERNEL_RADIUS + threadIdx.y + k][threadIdx.x]; + } + + // save result + dest[x + y * p.w] = saturate((int)result, 256); +} + + +static __global__ void convolution_horizontal(kernel_params p, uint8_t * src, uint8_t * dest) { + // coordinates of pixel processed by this thread + const int x = threadIdx.x + blockIdx.x * TX; + const int y = threadIdx.y + blockIdx.y * TY; + + // shared cache for processed pixels + __shared__ float cache[TY][TX + 2 * MAX_KERNEL_RADIUS]; + + // all threads populate shared cache + for(int nx = 0; nx < 2; nx++) { + cache[threadIdx.y][threadIdx.x + nx * TX] + = get_pix(src, p.w, p.h, x - MAX_KERNEL_RADIUS + nx * TX, y); + } + + // wait for all threads of block to finish their contribution to cache + __syncthreads(); + + // stop this thread if out of bounds + if(x >= p.w || y >= p.h) { + return; + } + + // get weighted sum of neighbors + float result = p.kernel[0] * cache[threadIdx.y][MAX_KERNEL_RADIUS + threadIdx.x]; + for(int k = 1; k <= MAX_KERNEL_RADIUS; k++) { + result += p.kernel[k] * cache[threadIdx.y][MAX_KERNEL_RADIUS + threadIdx.x + k]; + result += p.kernel[k] * cache[threadIdx.y][MAX_KERNEL_RADIUS + threadIdx.x - k]; + } + + // save result + dest[x + y * p.w] = saturate((int)result, 256); +} + + +static float gaussian(float sigma, float x) { + const float e = x / sigma; + return exp(-0.5 * e * e); +} + +int main(int argn, char ** argv) { + kernel_params params; + + if(argn < 4) { + usage("Wrong argument count", *argv); + } + + // read width and height + params.w = atoi(argv[1]); + params.h = atoi(argv[2]); + if(params.w < 1 || params.h < 1) { + usage("Both width and height must be positive integers", *argv); + } + const int pix_count = params.w * params.h; + + // read sigma and prepare normalized kernel (sum = 1) + const float sigma = atof(argv[3]); + float kernel_sum = 0.0f; + for(int k = 0; k <= MAX_KERNEL_RADIUS; k++) { + kernel_sum += params.kernel[k] = gaussian(sigma, k); + } + kernel_sum = 2.0 * kernel_sum - params.kernel[0]; + for(int k = 0; k <= MAX_KERNEL_RADIUS; k++) { + params.kernel[k] /= kernel_sum; + } + + // dump the kernel + printf("Convolution kernel:"); + for(int k = -MAX_KERNEL_RADIUS; k <= MAX_KERNEL_RADIUS; k++) { + printf(" %f", params.kernel[k < 0 ? -k : k]); + } + printf("\n"); + + // prepare buffers + uint8_t * const data_ptr = (uint8_t*)malloc(pix_count); + uint8_t * data_gpu_ptr[2]; + uint8_t * temp_gpu_ptr[2]; + cudaMalloc((void**)(data_gpu_ptr + 0), pix_count); + cudaMalloc((void**)(temp_gpu_ptr + 0), pix_count); + cudaMalloc((void**)(data_gpu_ptr + 1), pix_count); + cudaMalloc((void**)(temp_gpu_ptr + 1), pix_count); + + // two CUDA streams for asynchronous kernel and data transfers + cudaStream_t streams[2]; + cudaStreamCreate(streams + 0); + cudaStreamCreate(streams + 1); + + // measure time of processing of all images + const double begin = timer_ms(); + for(int i = 3; i <= argn; i++) { + // index of I/O buffers in this iteration + const int io_idx = i & 1; + + // index of computing resources in this iteration + const int comp_idx = io_idx ^ 1; + + // start processing of image loaded in previous iteration + // (except of first and last iteration) + if(i > 3 && i < argn) { + // launch vertical and horizontal pass + dim3 block(TX, TY); + dim3 grid((params.w + TX - 1) / TX, (params.h + TY - 1) / TY); + convolution_vertical<<>> + (params, data_gpu_ptr[comp_idx], temp_gpu_ptr[comp_idx]); + convolution_horizontal<<>> + (params, temp_gpu_ptr[comp_idx], data_gpu_ptr[comp_idx]); + } + + // processing now runs asynchronously on the GPU => save reauls + // from previous iteration (except of two first iterations) + if(i > 4) { + // copy data back from GPU + cudaMemcpyAsync(data_ptr, data_gpu_ptr[io_idx], pix_count, + cudaMemcpyDeviceToHost, streams[io_idx]); + + // compose output filename + char out_path[MAX_PATH_LEN + 1]; + snprintf(out_path, MAX_PATH_LEN, "%s.out.gray", argv[i - 1]); + + // wait for the data to actually appear in the buffer + cudaStreamSynchronize(streams[io_idx]); + + // write data to output file + FILE * const out_file = fopen(out_path, "wb"); + if(NULL == out_file || 1 != fwrite(data_ptr, pix_count, 1, out_file)) { + error(out_path); + } + fclose(out_file); + } + + // load input for next iteration (except of two last iterations) + if(i < (argn - 1)) { + // read input file + printf("Processing '%s'\n", argv[i + 1]); + FILE * const src_file = fopen(argv[i + 1], "rb"); + if(NULL == src_file || 1 != fread(data_ptr, pix_count, 1, src_file)) { + error(argv[i + 1]); + } + fclose(src_file); + + // copy data to GPU memory + cudaMemcpyAsync(data_gpu_ptr[io_idx], data_ptr, pix_count, + cudaMemcpyHostToDevice, streams[io_idx]); + + // make sure that the buffer is ready for next iteration + cudaStreamSynchronize(streams[io_idx]); + } + } + const double end = timer_ms(); + + // print total time + printf("time: %f ms, %d images => %f ms/image\n", + end - begin, argn - 4, (end - begin) / (argn - 4)); + + // cleanup + free(data_ptr); + cudaStreamDestroy(streams[0]); + cudaStreamDestroy(streams[1]); + cudaFree(data_gpu_ptr[0]); + cudaFree(temp_gpu_ptr[0]); + cudaFree(data_gpu_ptr[1]); + cudaFree(temp_gpu_ptr[1]); + return 0; +} + diff --git a/2014_06_19_Comprimato_Ultimate_Commodity_supercomputer/v06-gpu.cu b/2014_06_19_Comprimato_Ultimate_Commodity_supercomputer/v06-gpu.cu new file mode 100644 index 0000000..8e79e0e --- /dev/null +++ b/2014_06_19_Comprimato_Ultimate_Commodity_supercomputer/v06-gpu.cu @@ -0,0 +1,244 @@ +#include +#include +#include +#include +#include + + +#define MAX_PATH_LEN (32 * 1024) +#define MAX_KERNEL_RADIUS 16 + +// thread block size +#define TX 32 +#define TY 32 + +struct kernel_params { + float kernel[MAX_KERNEL_RADIUS + 1]; + int w; + int h; +}; + +static void error(const char * message) { + fprintf(stderr, "ERROR: %s\n", message); + exit(-1); +} + +static void usage(const char * message, const char * app) { + fprintf(stderr, "Usage: %s width height sigma file1 ... fileN\n", app); + fprintf(stderr, "Example: %s 1920 1080 3 f1.gray f2.gray f3.gray\n", app); + error(message); +} + +static double timer_ms() { + struct timeval tv; + gettimeofday(&tv, NULL); + return tv.tv_sec * 1000.0 + tv.tv_usec * 0.001; +} + +static __device__ int saturate(int n, int max_value) { + return max(0, min(n, max_value - 1)); +} + +static __device__ int get_pix(const uint8_t * src, int w, int h, int x, int y) { + return (float)src[saturate(x, w) + saturate(y, h) * w]; +} + +static __global__ void convolution_vertical(kernel_params p, uint8_t * src, uint8_t * dest) { + // coordinates of pixel processed by this thread + const int x = threadIdx.x + blockIdx.x * TX; + const int y = threadIdx.y + blockIdx.y * TY; + + // shared cache for processed pixels + __shared__ float cache[TY + 2 * MAX_KERNEL_RADIUS][TX]; + + // all threads populate shared cache + for(int ny = 0; ny < 2; ny++) { + cache[threadIdx.y + ny * TY][threadIdx.x] + = get_pix(src, p.w, p.h, x, y - MAX_KERNEL_RADIUS + ny * TY); + } + + // wait for all threads of block to finish their contribution to cache + __syncthreads(); + + // stop this thread if out of bounds + if(x >= p.w || y >= p.h) { + return; + } + + // get weighted sum of neighbors + float result = p.kernel[0] * cache[MAX_KERNEL_RADIUS + threadIdx.y][threadIdx.x]; + for(int k = 1; k <= MAX_KERNEL_RADIUS; k++) { + result += p.kernel[k] * cache[MAX_KERNEL_RADIUS + threadIdx.y - k][threadIdx.x]; + result += p.kernel[k] * cache[MAX_KERNEL_RADIUS + threadIdx.y + k][threadIdx.x]; + } + + // save result + dest[x + y * p.w] = saturate((int)result, 256); +} + + +static __global__ void convolution_horizontal(kernel_params p, uint8_t * src, uint8_t * dest) { + // coordinates of pixel processed by this thread + const int x = threadIdx.x + blockIdx.x * TX; + const int y = threadIdx.y + blockIdx.y * TY; + + // shared cache for processed pixels + __shared__ float cache[TY][TX + 2 * MAX_KERNEL_RADIUS]; + + // all threads populate shared cache + for(int nx = 0; nx < 2; nx++) { + cache[threadIdx.y][threadIdx.x + nx * TX] + = get_pix(src, p.w, p.h, x - MAX_KERNEL_RADIUS + nx * TX, y); + } + + // wait for all threads of block to finish their contribution to cache + __syncthreads(); + + // stop this thread if out of bounds + if(x >= p.w || y >= p.h) { + return; + } + + // get weighted sum of neighbors + float result = p.kernel[0] * cache[threadIdx.y][MAX_KERNEL_RADIUS + threadIdx.x]; + for(int k = 1; k <= MAX_KERNEL_RADIUS; k++) { + result += p.kernel[k] * cache[threadIdx.y][MAX_KERNEL_RADIUS + threadIdx.x + k]; + result += p.kernel[k] * cache[threadIdx.y][MAX_KERNEL_RADIUS + threadIdx.x - k]; + } + + // save result + dest[x + y * p.w] = saturate((int)result, 256); +} + + +static float gaussian(float sigma, float x) { + const float e = x / sigma; + return exp(-0.5 * e * e); +} + +int main(int argn, char ** argv) { + kernel_params params; + + if(argn < 4) { + usage("Wrong argument count", *argv); + } + + // read width and height + params.w = atoi(argv[1]); + params.h = atoi(argv[2]); + if(params.w < 1 || params.h < 1) { + usage("Both width and height must be positive integers", *argv); + } + const int pix_count = params.w * params.h; + + // read sigma and prepare normalized kernel (sum = 1) + const float sigma = atof(argv[3]); + float kernel_sum = 0.0f; + for(int k = 0; k <= MAX_KERNEL_RADIUS; k++) { + kernel_sum += params.kernel[k] = gaussian(sigma, k); + } + kernel_sum = 2.0 * kernel_sum - params.kernel[0]; + for(int k = 0; k <= MAX_KERNEL_RADIUS; k++) { + params.kernel[k] /= kernel_sum; + } + + // dump the kernel + printf("Convolution kernel:"); + for(int k = -MAX_KERNEL_RADIUS; k <= MAX_KERNEL_RADIUS; k++) { + printf(" %f", params.kernel[k < 0 ? -k : k]); + } + printf("\n"); + + // prepare buffers + uint8_t * data_ptr; + uint8_t * data_gpu_ptr[2]; + uint8_t * temp_gpu_ptr[2]; + cudaMalloc((void**)(data_gpu_ptr + 0), pix_count); + cudaMalloc((void**)(temp_gpu_ptr + 0), pix_count); + cudaMalloc((void**)(data_gpu_ptr + 1), pix_count); + cudaMalloc((void**)(temp_gpu_ptr + 1), pix_count); + cudaMallocHost((void**)&data_ptr, pix_count, cudaHostAllocDefault); + + // two CUDA streams for asynchronous kernel and data transfers + cudaStream_t streams[2]; + cudaStreamCreate(streams + 0); + cudaStreamCreate(streams + 1); + + // measure time of processing of all images + const double begin = timer_ms(); + for(int i = 3; i <= argn; i++) { + // index of I/O buffers in this iteration + const int io_idx = i & 1; + + // index of computing resources in this iteration + const int comp_idx = io_idx ^ 1; + + // start processing of image loaded in previous iteration + // (except of first and last iteration) + if(i > 3 && i < argn) { + // launch vertical and horizontal pass + dim3 block(TX, TY); + dim3 grid((params.w + TX - 1) / TX, (params.h + TY - 1) / TY); + convolution_vertical<<>> + (params, data_gpu_ptr[comp_idx], temp_gpu_ptr[comp_idx]); + convolution_horizontal<<>> + (params, temp_gpu_ptr[comp_idx], data_gpu_ptr[comp_idx]); + } + + // processing now runs asynchronously on the GPU => save reauls + // from previous iteration (except of two first iterations) + if(i > 4) { + // copy data back from GPU + cudaMemcpyAsync(data_ptr, data_gpu_ptr[io_idx], pix_count, + cudaMemcpyDeviceToHost, streams[io_idx]); + + // compose output filename + char out_path[MAX_PATH_LEN + 1]; + snprintf(out_path, MAX_PATH_LEN, "%s.out.gray", argv[i - 1]); + + // wait for the data to actually appear in the buffer + cudaStreamSynchronize(streams[io_idx]); + + // write data to output file + FILE * const out_file = fopen(out_path, "wb"); + if(NULL == out_file || 1 != fwrite(data_ptr, pix_count, 1, out_file)) { + error(out_path); + } + fclose(out_file); + } + + // load input for next iteration (except of two last iterations) + if(i < (argn - 1)) { + // read input file + printf("Processing '%s'\n", argv[i + 1]); + FILE * const src_file = fopen(argv[i + 1], "rb"); + if(NULL == src_file || 1 != fread(data_ptr, pix_count, 1, src_file)) { + error(argv[i + 1]); + } + fclose(src_file); + + // copy data to GPU memory + cudaMemcpyAsync(data_gpu_ptr[io_idx], data_ptr, pix_count, + cudaMemcpyHostToDevice, streams[io_idx]); + + // make sure that the buffer is ready for next iteration + cudaStreamSynchronize(streams[io_idx]); + } + } + const double end = timer_ms(); + + // print total time + printf("time: %f ms, %d images => %f ms/image\n", + end - begin, argn - 4, (end - begin) / (argn - 4)); + + // cleanup + cudaStreamDestroy(streams[0]); + cudaStreamDestroy(streams[1]); + cudaFree(data_gpu_ptr[0]); + cudaFree(temp_gpu_ptr[0]); + cudaFree(data_gpu_ptr[1]); + cudaFree(temp_gpu_ptr[1]); + cudaFreeHost(data_ptr); + return 0; +} + diff --git a/2014_06_19_Comprimato_Ultimate_Commodity_supercomputer/v07-gpu.cu b/2014_06_19_Comprimato_Ultimate_Commodity_supercomputer/v07-gpu.cu new file mode 100644 index 0000000..3ae1a71 --- /dev/null +++ b/2014_06_19_Comprimato_Ultimate_Commodity_supercomputer/v07-gpu.cu @@ -0,0 +1,276 @@ +#include +#include +#include +#include +#include + + +#define MAX_PATH_LEN (32 * 1024) +#define MAX_KERNEL_RADIUS 16 +#define KERNEL_SIZE (1 + 2 * (MAX_KERNEL_RADIUS)) + +// thread block size +#define TX 32 +#define TY 32 +#define TX_V 128 +#define TY_V 64 + + +struct kernel_params { + float kernel[MAX_KERNEL_RADIUS + 1]; + int w; + int h; +}; + +static void error(const char * message) { + fprintf(stderr, "ERROR: %s\n", message); + exit(-1); +} + +static void usage(const char * message, const char * app) { + fprintf(stderr, "Usage: %s width height sigma file1 ... fileN\n", app); + fprintf(stderr, "Example: %s 1920 1080 3 f1.gray f2.gray f3.gray\n", app); + error(message); +} + +static double timer_ms() { + struct timeval tv; + gettimeofday(&tv, NULL); + return tv.tv_sec * 1000.0 + tv.tv_usec * 0.001; +} + +static __device__ int saturate(int n, int max_value) { + return max(0, min(n, max_value - 1)); +} + +static __device__ int get_pix(const uint8_t * src, int w, int h, int x, int y) { + return (float)src[saturate(x, w) + saturate(y, h) * w]; +} + +static __global__ void convolution_vertical(kernel_params p, uint8_t * src, uint8_t * dest) { + // coordinate of column processed by this thread + const int x = threadIdx.x + blockIdx.x * TX_V; + + // stop if thread's column is out of bounds + if(x >= p.w) { + return; + } + + // thread's private cache for last values (organized as circular buffer) + float cache[KERNEL_SIZE]; + + // y-coordinate of next pixel to be saved and pointer to it + int dest_y = threadIdx.y + blockIdx.y * TY_V; + uint8_t * dest_ptr = dest + x + dest_y * p.w; + + // y-coordinate of next pixel to be loaded and pointer to it + int src_y = dest_y - MAX_KERNEL_RADIUS; + const uint8_t * src_pix = src + x + saturate(src_y, p.h) * p.w; + + // populate the cache, except of last item + #pragma unroll + for(int n = 0; n < MAX_KERNEL_RADIUS * 2; n++) { + // load next pixel + cache[n] = *src_pix; + + // advance the pointer only if not at the top or bottom of the image + src_y++; + if(src_y > 0 && src_y < p.h) { + src_pix += p.w; + } + } + + // keep loading more pixels, saving one output pixel for each loaded pixel + #pragma unroll + for(int n = 0; n < TY_V; n++) { + // load next sample to the buffer and possibly advance the pointer + cache[(n + KERNEL_SIZE - 1) % KERNEL_SIZE] = *src_pix; + src_y++; + if(src_y > 0 && src_y < p.h) { + src_pix += p.w; + } + + // compute value of current output + float val = cache[(n + MAX_KERNEL_RADIUS) % KERNEL_SIZE] * p.kernel[0]; + #pragma unroll + for(int k = 1; k <= MAX_KERNEL_RADIUS; k++) { + val += cache[(n + MAX_KERNEL_RADIUS + k) % KERNEL_SIZE] * p.kernel[k]; + val += cache[(n + MAX_KERNEL_RADIUS - k) % KERNEL_SIZE] * p.kernel[k]; + } + + // save current output only if not at the end + if(dest_y++ < p.h) { + *dest_ptr = saturate((int)val, 256); + } + dest_ptr += p.w; + } +} + + +static __global__ void convolution_horizontal(kernel_params p, uint8_t * src, uint8_t * dest) { + // coordinates of pixel processed by this thread + const int x = threadIdx.x + blockIdx.x * TX; + const int y = threadIdx.y + blockIdx.y * TY; + + // shared cache for processed pixels + __shared__ float cache[TY][TX + 2 * MAX_KERNEL_RADIUS]; + + // all threads populate shared cache + for(int nx = 0; nx < 2; nx++) { + cache[threadIdx.y][threadIdx.x + nx * TX] + = get_pix(src, p.w, p.h, x - MAX_KERNEL_RADIUS + nx * TX, y); + } + + // wait for all threads of block to finish their contribution to cache + __syncthreads(); + + // stop this thread if out of bounds + if(x >= p.w || y >= p.h) { + return; + } + + // get weighted sum of neighbors + float result = p.kernel[0] * cache[threadIdx.y][MAX_KERNEL_RADIUS + threadIdx.x]; + for(int k = 1; k <= MAX_KERNEL_RADIUS; k++) { + result += p.kernel[k] * cache[threadIdx.y][MAX_KERNEL_RADIUS + threadIdx.x + k]; + result += p.kernel[k] * cache[threadIdx.y][MAX_KERNEL_RADIUS + threadIdx.x - k]; + } + + // save result + dest[x + y * p.w] = saturate((int)result, 256); +} + + +static float gaussian(float sigma, float x) { + const float e = x / sigma; + return exp(-0.5 * e * e); +} + +int main(int argn, char ** argv) { + kernel_params params; + + if(argn < 4) { + usage("Wrong argument count", *argv); + } + + // read width and height + params.w = atoi(argv[1]); + params.h = atoi(argv[2]); + if(params.w < 1 || params.h < 1) { + usage("Both width and height must be positive integers", *argv); + } + const int pix_count = params.w * params.h; + + // read sigma and prepare normalized kernel (sum = 1) + const float sigma = atof(argv[3]); + float kernel_sum = 0.0f; + for(int k = 0; k <= MAX_KERNEL_RADIUS; k++) { + kernel_sum += params.kernel[k] = gaussian(sigma, k); + } + kernel_sum = 2.0 * kernel_sum - params.kernel[0]; + for(int k = 0; k <= MAX_KERNEL_RADIUS; k++) { + params.kernel[k] /= kernel_sum; + } + + // dump the kernel + printf("Convolution kernel:"); + for(int k = -MAX_KERNEL_RADIUS; k <= MAX_KERNEL_RADIUS; k++) { + printf(" %f", params.kernel[k < 0 ? -k : k]); + } + printf("\n"); + + // prepare buffers + uint8_t * data_ptr; + uint8_t * data_gpu_ptr[2]; + uint8_t * temp_gpu_ptr[2]; + cudaMalloc((void**)(data_gpu_ptr + 0), pix_count); + cudaMalloc((void**)(temp_gpu_ptr + 0), pix_count); + cudaMalloc((void**)(data_gpu_ptr + 1), pix_count); + cudaMalloc((void**)(temp_gpu_ptr + 1), pix_count); + cudaMallocHost((void**)&data_ptr, pix_count, cudaHostAllocDefault); + + // two CUDA streams for asynchronous kernel and data transfers + cudaStream_t streams[2]; + cudaStreamCreate(streams + 0); + cudaStreamCreate(streams + 1); + + // measure time of processing of all images + const double begin = timer_ms(); + for(int i = 3; i <= argn; i++) { + // index of I/O buffers in this iteration + const int io_idx = i & 1; + + // index of computing resources in this iteration + const int comp_idx = io_idx ^ 1; + + // start processing of image loaded in previous iteration + // (except of first and last iteration) + if(i > 3 && i < argn) { + // launch vertical and horizontal pass + dim3 block_h(TX, TY); + dim3 block_v(TX_V, 1); + dim3 grid_h((params.w + TX - 1) / TX, (params.h + TY - 1) / TY); + dim3 grid_v((params.w + TX_V - 1) / TX_V, (params.h + TY_V - 1) / TY_V); + convolution_vertical<<>> + (params, data_gpu_ptr[comp_idx], temp_gpu_ptr[comp_idx]); + convolution_horizontal<<>> + (params, temp_gpu_ptr[comp_idx], data_gpu_ptr[comp_idx]); + } + + // processing now runs asynchronously on the GPU => save reauls + // from previous iteration (except of two first iterations) + if(i > 4) { + // copy data back from GPU + cudaMemcpyAsync(data_ptr, data_gpu_ptr[io_idx], pix_count, + cudaMemcpyDeviceToHost, streams[io_idx]); + + // compose output filename + char out_path[MAX_PATH_LEN + 1]; + snprintf(out_path, MAX_PATH_LEN, "%s.out.gray", argv[i - 1]); + + // wait for the data to actually appear in the buffer + cudaStreamSynchronize(streams[io_idx]); + + // write data to output file + FILE * const out_file = fopen(out_path, "wb"); + if(NULL == out_file || 1 != fwrite(data_ptr, pix_count, 1, out_file)) { + error(out_path); + } + fclose(out_file); + } + + // load input for next iteration (except of two last iterations) + if(i < (argn - 1)) { + // read input file + printf("Processing '%s'\n", argv[i + 1]); + FILE * const src_file = fopen(argv[i + 1], "rb"); + if(NULL == src_file || 1 != fread(data_ptr, pix_count, 1, src_file)) { + error(argv[i + 1]); + } + fclose(src_file); + + // copy data to GPU memory + cudaMemcpyAsync(data_gpu_ptr[io_idx], data_ptr, pix_count, + cudaMemcpyHostToDevice, streams[io_idx]); + + // make sure that the buffer is ready for next iteration + cudaStreamSynchronize(streams[io_idx]); + } + } + const double end = timer_ms(); + + // print total time + printf("time: %f ms, %d images => %f ms/image\n", + end - begin, argn - 4, (end - begin) / (argn - 4)); + + // cleanup + cudaStreamDestroy(streams[0]); + cudaStreamDestroy(streams[1]); + cudaFree(data_gpu_ptr[0]); + cudaFree(temp_gpu_ptr[0]); + cudaFree(data_gpu_ptr[1]); + cudaFree(temp_gpu_ptr[1]); + cudaFreeHost(data_ptr); + return 0; +} +