#define __CL_ENABLE_EXCEPTIONS

#include <algorithm>
#include <cstdlib>
#include <cstring>
#include <exception>
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>

#include <omp.h>
#include <CL/cl.hpp>


// Sorts the N ints starting at a into ascending order with std::sort.
// Returns the elapsed wall-clock time in seconds, taken as the difference
// between two omp_get_wtime() readings around the sort.
double SortSTL(int *a, int N) {
	const double begin = omp_get_wtime();
	int *const last = a + N;
	std::sort(a, last);
	const double end = omp_get_wtime();
	return end - begin;
}


// Sorts a[0..N) into ascending order, in place, with a bitonic sorting
// network whose compare-exchange passes are parallelized with OpenMP.
//
//   a          - array to sort; modified in place
//   N          - number of elements. NOTE: element i is paired with element
//                i^j, which can index past N-1 unless N is a power of two,
//                so callers must pass a power-of-two N.
//   numThreads - thread count requested for each parallel pass
//
// Returns the elapsed wall-clock time in seconds (omp_get_wtime difference).
double SortOMP(int *a, int N, int numThreads) {
	double start = omp_get_wtime();

	// k: size of the bitonic sequences being merged; j: partner distance.
	for (int k=2; k<=N; k=2*k) {
		for (int j=k>>1; j>0; j=j>>1) {
			// Within one (k,j) pass every pair {i, i^j} is touched only by the
			// iteration with the lower index, so iterations never share elements.
			#pragma omp parallel for num_threads(numThreads)
			for (int i=0; i<N; i++) {
				const int ixj = i^j;
				if (ixj>i) {
					// bit k of i selects the direction of this sub-sequence
					const bool ascending = (i&k)==0;
					const bool outOfOrder = ascending ? (a[i]>a[ixj]) : (a[i]<a[ixj]);
					if (outOfOrder) {
						// The temporary must be local to the iteration: a variable
						// declared outside the parallel loop is shared between
						// threads, which makes the swap a data race.
						const int tmp = a[i];
						a[i] = a[ixj];
						a[ixj] = tmp;
					}
				}
			}
		}
	}

	return omp_get_wtime()-start;
}


// Bitonic sort reference for the algorithm used below:
// http://www.tools-of-computing.com/tc/CS/Sorts/bitonic_sort.htm

// OpenCL C source of the "sort" kernel: one compare-exchange pass of a
// bitonic sort over the int array a, for a given pair of step values (k, j).
// Work-item i (its global id in dimension 0) is paired with element i^j; only
// the lower-indexed item of each pair (ixj > i) acts, so each pair is handled
// by a single work-item. Pairs where bit k of i is clear are put in ascending
// order, the others in descending order.
// The kernel performs no bounds check on a[ixj].
const char *source = 
	"__kernel void sort(__global int *a, int k, int j) \n"
	"{                                                 \n"
	"  int i = get_global_id(0);                       \n"
	"  int ixj=i^j;                                    \n"
	"  int t;                                          \n"
	"  if (ixj>i) {                                    \n"
	"     if ((i&k)==0 && a[i]>a[ixj])                 \n"
    "         { t=a[i]; a[i]=a[ixj]; a[ixj]=t; }       \n"
	"     if ((i&k)!=0 && a[i]<a[ixj])                 \n"
    "         { t=a[i]; a[i]=a[ixj]; a[ixj]=t; }       \n"
	"  }                                               \n"
	"}                                                 \n";

// Sorts a[0..N) into ascending order, in place, by running the bitonic
// compare-exchange kernel once per (k, j) step on an OpenCL device.
//
//   a       - host array to sort; receives the sorted data
//   N       - number of elements. NOTE: the kernel is launched with N
//             work-items and indexes a[i^j]; callers must pass a power of two.
//   isCPU   - true: the buffer wraps a directly (CL_MEM_USE_HOST_PTR);
//             false: the buffer is initialized with a copy of a
//             (CL_MEM_COPY_HOST_PTR)
//   context - context the buffer is created in
//   queue   - command queue the kernel launches and the read-back are issued on
//   kernel  - kernel taking (buffer, k, j) as arguments 0..2
//
// Returns the elapsed wall-clock time in seconds (omp_get_wtime difference)
// covering the kernel passes and the read-back into a. Buffer creation, and
// with it the initial host-to-device copy, is not timed.
// OpenCL failures propagate as cl::Error.
double SortOpenCL(int *a, int N, bool isCPU, const cl::Context &context, cl::CommandQueue &queue, cl::Kernel &kernel) {
	// Fewer than two elements are already sorted; returning here also avoids
	// requesting a zero-sized buffer, which OpenCL rejects.
	if (N < 2)
		return 0.0;

	const ::size_t bytes = static_cast< ::size_t>(N)*sizeof(cl_int);

	// can only use the host pointer when on the host
	cl::Buffer buffer = cl::Buffer(context, 
		CL_MEM_READ_WRITE | (isCPU?CL_MEM_USE_HOST_PTR:CL_MEM_COPY_HOST_PTR), 
		bytes, a);  // initialize with existing buffer contents

	double start = omp_get_wtime();
	kernel.setArg(0, buffer);
	for (int k=2; k<=N; k=2*k) {
		kernel.setArg(1, k);
		for (int j=k>>1; j>0; j=j>>1) {
			kernel.setArg(2, j);
			queue.enqueueNDRangeKernel(kernel, cl::NDRange(), cl::NDRange(N), cl::NDRange());
			queue.finish();
		}
	}

	// A blocking map is what guarantees the host sees the device's results.
	// That holds for a CL_MEM_USE_HOST_PTR buffer too, where the runtime is
	// allowed to work on a cached copy until the region is mapped.
	void *mapped = queue.enqueueMapBuffer(buffer, CL_TRUE, CL_MAP_READ, 0, bytes);
	// Copy only when the map is not already the caller's array (the host-pointer
	// case); not part of the sort itself, but counted in timing as it is
	// supposed to be an in-place sort.
	if (mapped != static_cast<void *>(a))
		std::memcpy(a, mapped, bytes);

	const double elapsed = omp_get_wtime()-start;

	// Release the mapping; the original code never unmapped the region.
	queue.enqueueUnmapMemObject(buffer, mapped);
	queue.finish();

	return elapsed;
}


// Verifies that a sort produced the expected result.
//
//   msg      - name of the sort being checked; used in the error message
//   expected - reference (correctly sorted) sequence
//   actual   - sequence produced by the sort under test
//
// Throws std::runtime_error (a std::exception) with the message
// "Sort <msg> failed" when the two sequences differ.
void assert_sort(const std::string &msg, const std::vector<int> &expected, const std::vector<int> &actual) {
	// std::exception has no standard constructor taking a message (that is an
	// MSVC extension); std::runtime_error is the portable equivalent.
	if (expected != actual)
		throw std::runtime_error("Sort " + msg + " failed");
}

// Benchmarks every sort implementation on one pseudo-random array of N ints
// and prints the timings as a CSV row on std::cout. The CSV header row is
// printed first when `first` is true, after which `first` is cleared.
// Each implementation's output is compared with a std::sort reference through
// assert_sort, which throws on a mismatch.
void SortTest(bool &first, int N,
	const cl::Context &cpuContext, cl::CommandQueue &cpuQueue, cl::Kernel &cpuKernel,
	const cl::Context &gpuContext, cl::CommandQueue &gpuQueue, cl::Kernel &gpuKernel) {
	// input data: N values from std::rand with a fixed seed
	std::vector<int> unsorted(N);
	std::srand(0);
	for (std::vector<int>::iterator it = unsorted.begin(); it != unsorted.end(); ++it)
		*it = std::rand();

	// reference result used to validate every implementation
	std::vector<int> sorted(unsorted);
	std::sort(sorted.begin(), sorted.end());

	// scratch array, reset to the unsorted input before each run
	std::vector<int> work(unsorted);

	// single-threaded STL sort
	const double stlSeconds = SortSTL(&work[0], work.size());
	assert_sort("SortSTL", sorted, work);

	// single-threaded OpenMP bitonic sort
	work = unsorted;
	const double omp1Seconds = SortOMP(&work[0], work.size(), 1);
	assert_sort("SortOMP1", sorted, work);

	// multi-threaded OpenMP bitonic sort
	work = unsorted;
	const double ompNSeconds = SortOMP(&work[0], work.size(), omp_get_max_threads());
	assert_sort("SortOMPN", sorted, work);

	// OpenCL with CPU device
	work = unsorted;
	const double clCpuSeconds = SortOpenCL(&work[0], work.size(), true, cpuContext, cpuQueue, cpuKernel);
	assert_sort("SortOpenCLCPU", sorted, work);

	// OpenCL with GPU device
	work = unsorted;
	const double clGpuSeconds = SortOpenCL(&work[0], work.size(), false, gpuContext, gpuQueue, gpuKernel);
	assert_sort("SortOpenCLGPU", sorted, work);

	// the header is emitted next to the row so the two stay in step
	if (first) {
		std::cout << "\"N\",\"STL\",\"OMP1\",\"OMPN\",\"OpenCLCPU\",\"OpenCLGPU\"" << std::endl;
		first = false;
	}
	std::cout << N << "," << stlSeconds << "," << omp1Seconds << ","
		<< ompNSeconds << "," << clCpuSeconds << "," << clGpuSeconds
		<< std::endl;
}

// Creates the OpenCL objects needed to run the "sort" kernel on a device of
// the requested type.
//
//   platforms - candidate platforms; they are tried in order and the first
//               one that offers a device of the requested type is used
//   type      - device type to look for (e.g. CL_DEVICE_TYPE_CPU)
//   context   - receives the context created for that platform and type
//   queue     - receives a command queue on the context's first device
//   kernel    - receives the "sort" kernel built from `source`
//
// Throws std::runtime_error("Device not available") when no platform offers a
// matching device. Any other OpenCL failure propagates as cl::Error; on a
// program build failure the build log is printed to std::cout first.
void GetOpenCLObjects(std::vector<cl::Platform> &platforms, cl_device_type type, cl::Context &context, cl::CommandQueue &queue, cl::Kernel &kernel) {
	// create context and queue
	std::vector<cl::Device> devices;
	for (std::vector<cl::Platform>::size_type p = 0; p < platforms.size() && devices.empty(); ++p) {
		cl_context_properties cprops[3] = {
			CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>(platforms[p]()), 0 };
		try {
			context = cl::Context(type, cprops);
			devices = context.getInfo<CL_CONTEXT_DEVICES>();
		} catch (const cl::Error &err) {
			// A platform without a matching device is not fatal: try the next
			// one. Everything else is a real error.
			if (err.err() != CL_DEVICE_NOT_FOUND)
				throw;
		}
	}
	if (devices.empty())
		throw std::runtime_error("Device not available");
	queue = cl::CommandQueue(context, devices[0]);

	// compile source, get kernel entry point
	cl::Program program;
	try {
		cl::Program::Sources sources(1, std::make_pair(source, std::strlen(source)));
		program = cl::Program(context, sources);
		program.build(devices);
		kernel = cl::Kernel(program, "sort");
	} catch (const cl::Error &err) {
		// if it was a compilation error
		if (err.err() == CL_BUILD_PROGRAM_FAILURE)
			std::cout << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[0]) << std::endl;
		throw;
	}
}

int main() {
	try {
		// get a platform and device
		std::vector<cl::Platform> platforms;
		cl::Platform::get(&platforms);
		if (platforms.size() == 0) 
			throw std::exception("OpenCL not available");

		cl::Context cpuContext, gpuContext;
		cl::CommandQueue cpuQueue, gpuQueue;
		cl::Kernel cpuKernel, gpuKernel;
		GetOpenCLObjects(platforms, CL_DEVICE_TYPE_CPU, cpuContext, cpuQueue, cpuKernel);
		GetOpenCLObjects(platforms, CL_DEVICE_TYPE_GPU, gpuContext, gpuQueue, gpuKernel);
		
		bool first = true;
		for (int b=20; b<=25; b++)
			SortTest(first, 1<<b, 
			    cpuContext, cpuQueue, cpuKernel,
			    gpuContext, gpuQueue, gpuKernel);
	} catch (cl::Error &err) {
		std::cout << "Error: " << err.what() << "(" << err.err() << ")" << std:: endl;
	} catch (std::exception &e) {
		std::cout << "Exception: " << e.what() << std::endl;
	}

	return 0;
}
