Add test cases for CUDA features.

- CUDA __constant__ memory
- CUDA global __device__ memory
- CUDA __host__ __device__ prefix
- CUDA kernel launch with arguments
- CUDA templated kernels
- CUDA dynamic (runtime-sized) shared memory
- CUDA streams
- test whether a CUDA device is available
Simeon Ehrig 2018-04-24 16:45:35 +02:00 committed by sftnight
parent d14ab2daec
commit e7b0e22ae8
8 changed files with 423 additions and 0 deletions
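All eight tests share the same interpreter-script pattern: a lit RUN line pipes the file into cling in CUDA mode, kernels are defined between .rawInput 1 and .rawInput 0, and CUDA runtime calls are entered without a trailing semicolon so that cling prints the returned cudaError_t, which FileCheck matches against the CHECK lines. Below is a minimal sketch of that pattern; it is not one of the committed files, and the kernel and variable names are illustrative only.

// RUN: cat %s | %cling -x cuda -Xclang -verify 2>&1 | FileCheck %s
// REQUIRES: cuda-runtime
.rawInput 1
// illustrative kernel, not part of the commit
__global__ void exampleKernel(int * out){
   *out = 42;
}
.rawInput 0
int hostResult = 0;
int * deviceResult;
// no trailing semicolon: cling echoes the returned cudaError_t
cudaMalloc( (void **) &deviceResult, sizeof(int))
// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
exampleKernel<<<1,1>>>(deviceResult);
cudaGetLastError()
// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
cudaMemcpy(&hostResult, deviceResult, sizeof(int), cudaMemcpyDeviceToHost)
// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
hostResult
// CHECK: (int) 42
// expected-no-diagnostics
.q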


@@ -0,0 +1,47 @@
//------------------------------------------------------------------------------
// CLING - the C++ LLVM-based InterpreterG :)
//
// This file is dual-licensed: you can choose to license it under the University
// of Illinois Open Source License or the GNU Lesser General Public License. See
// LICENSE.TXT for details.
//------------------------------------------------------------------------------
// The test checks whether __constant__ memory works.
// RUN: cat %s | %cling -x cuda -Xclang -verify 2>&1 | FileCheck %s
// REQUIRES: cuda-runtime
__constant__ int constNum[4];
.rawInput 1
__global__ void gKernel1(int * output){
int i = threadIdx.x;
output[i] = constNum[i];
}
.rawInput 0
int hostInput[4] = {1,2,3,4};
int hostOutput[4] = {0,0,0,0};
int * deviceOutput;
cudaMalloc( (void **) &deviceOutput, sizeof(int)*4)
// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
cudaMemcpyToSymbol(constNum, &hostInput, sizeof(int)*4)
// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
gKernel1<<<1,4>>>(deviceOutput);
cudaGetLastError()
// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
cudaMemcpy(&hostOutput, deviceOutput, sizeof(int)*4, cudaMemcpyDeviceToHost)
// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
// FIXME: output of the whole static array isn't working at the moment
hostOutput[0]
// CHECK: (int) 1
hostOutput[1]
// CHECK: (int) 2
hostOutput[2]
// CHECK: (int) 3
hostOutput[3]
// CHECK: (int) 4
// expected-no-diagnostics
.q


@@ -0,0 +1,65 @@
//------------------------------------------------------------------------------
// CLING - the C++ LLVM-based InterpreterG :)
//
// This file is dual-licensed: you can choose to license it under the University
// of Illinois Open Source License or the GNU Lesser General Public License. See
// LICENSE.TXT for details.
//------------------------------------------------------------------------------
// The test checks whether global __device__ memory works. There are two tests: one
// uses direct value assignment at declaration and the other uses reassignment.
// RUN: cat %s | %cling -x cuda -Xclang -verify 2>&1 | FileCheck %s
// REQUIRES: cuda-runtime
__device__ int dAnswer = 42;
.rawInput 1
__global__ void gKernel1(int * output){
int i = threadIdx.x;
output[i] = dAnswer;
}
.rawInput 0
int hostOutput[4] = {1,1,1,1};
int * deviceOutput;
cudaMalloc( (void **) &deviceOutput, sizeof(int)*4)
// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
gKernel1<<<1,4>>>(deviceOutput);
cudaGetLastError()
// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
cudaMemcpy(&hostOutput, deviceOutput, sizeof(int)*4, cudaMemcpyDeviceToHost)
// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
// FIXME: output of the whole static array isn't working at the moment
hostOutput[0]
// CHECK: (int) 42
hostOutput[1]
// CHECK: (int) 42
hostOutput[2]
// CHECK: (int) 42
hostOutput[3]
// CHECK: (int) 42
// Test whether reassignment from the host also works.
dAnswer = 43;
gKernel1<<<1,4>>>(deviceOutput);
cudaGetLastError()
// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
cudaMemcpy(&hostOutput, deviceOutput, sizeof(int)*4, cudaMemcpyDeviceToHost)
// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
// FIXME: output of the whole static array isn't working at the moment
hostOutput[0]
// CHECK: (int) 43
hostOutput[1]
// CHECK: (int) 43
hostOutput[2]
// CHECK: (int) 43
hostOutput[3]
// CHECK: (int) 43
// expected-no-diagnostics
.q


@@ -0,0 +1,43 @@
//------------------------------------------------------------------------------
// CLING - the C++ LLVM-based InterpreterG :)
//
// This file is dual-licensed: you can choose to license it under the University
// of Illinois Open Source License or the GNU Lesser General Public License. See
// LICENSE.TXT for details.
//------------------------------------------------------------------------------
// The test checks whether a function declared with the __host__ and __device__
// prefixes is available on both the host and the device side.
// RUN: cat %s | %cling -x cuda -Xclang -verify 2>&1 | FileCheck %s
// REQUIRES: cuda-runtime
.rawInput 1
__host__ __device__ int sum(int a, int b){
return a + b;
}
__global__ void gKernel1(int * output){
*output = sum(40,2);
}
.rawInput 0
sum(41,1)
// CHECK: (int) 42
int hostOutput = 0;
int * deviceOutput;
cudaMalloc( (void **) &deviceOutput, sizeof(int))
// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
gKernel1<<<1,1>>>(deviceOutput);
cudaGetLastError()
// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
cudaMemcpy(&hostOutput, deviceOutput, sizeof(int), cudaMemcpyDeviceToHost)
// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
hostOutput
// CHECK: (int) 42
// expected-no-diagnostics
.q


@@ -0,0 +1,61 @@
//------------------------------------------------------------------------------
// CLING - the C++ LLVM-based InterpreterG :)
//
// This file is dual-licensed: you can choose to license it under the University
// of Illinois Open Source License or the GNU Lesser General Public License. See
// LICENSE.TXT for details.
//------------------------------------------------------------------------------
// The test checks whether a CUDA kernel works with arguments, built-in variables,
// and __device__ functions.
// RUN: cat %s | %cling -x cuda -Xclang -verify 2>&1 | FileCheck %s
// REQUIRES: cuda-runtime
// Test whether a simple kernel with arguments works.
.rawInput 1
__global__ void gKernel1(int * out){
*out = 42;
}
.rawInput 0
int * deviceOutput;
int hostOutput = 0;
cudaMalloc( (void **) &deviceOutput, sizeof(int))
// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
gKernel1<<<1,1>>>(deviceOutput);
cudaGetLastError()
// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
cudaMemcpy(&hostOutput, deviceOutput, sizeof(int), cudaMemcpyDeviceToHost)
// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
hostOutput
// CHECK: (int) 42
// Test whether a parallel kernel that uses a built-in variable and a __device__ function works.
.rawInput 1
__device__ int mul7(int in){
return 7*in;
}
__global__ void gKernel2(int * out){
int i = threadIdx.x;
out[i] = mul7(i);
}
.rawInput 0
int * deviceOutput2;
int hostOutput2[4] = {0,0,0,0};
cudaMalloc( (void **) &deviceOutput2, sizeof(int)*4)
// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
gKernel2<<<1,4>>>(deviceOutput2);
cudaGetLastError()
// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
cudaMemcpy(&hostOutput2, deviceOutput2, sizeof(int)*4, cudaMemcpyDeviceToHost)
// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
hostOutput2[0] + hostOutput2[1] + hostOutput2[2] + hostOutput2[3]
// CHECK: (int) 42
// expected-no-diagnostics
.q


@@ -0,0 +1,58 @@
//------------------------------------------------------------------------------
// CLING - the C++ LLVM-based InterpreterG :)
//
// This file is dual-licensed: you can choose to license it under the University
// of Illinois Open Source License or the GNU Lesser General Public License. See
// LICENSE.TXT for details.
//------------------------------------------------------------------------------
// The test checks whether templated CUDA kernels work.
// RUN: cat %s | %cling -x cuda -Xclang -verify 2>&1 | FileCheck %s
// REQUIRES: cuda-runtime
// Check whether device-side template resolution works.
.rawInput 1
template <int T>
__device__ int dKernel1(){
return T;
}
__global__ void gKernel1(int * out){
*out = dKernel1<42>();
}
.rawInput 0
int * deviceOutput;
int hostOutput = 0;
cudaMalloc( (void **) &deviceOutput, sizeof(int))
// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
gKernel1<<<1,1>>>(deviceOutput);
cudaGetLastError()
// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
cudaMemcpy(&hostOutput, deviceOutput, sizeof(int), cudaMemcpyDeviceToHost)
// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
hostOutput
// CHECK: (int) 42
// Check whether host-side template resolution of a __global__ kernel works.
.rawInput 1
template <int T>
__global__ void gKernel2(int * out){
*out = dKernel1<T>();
}
.rawInput 0
hostOutput = 0;
gKernel2<43><<<1,1>>>(deviceOutput);
cudaGetLastError()
// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
cudaMemcpy(&hostOutput, deviceOutput, sizeof(int), cudaMemcpyDeviceToHost)
// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
hostOutput
// CHECK: (int) 43
// expected-no-diagnostics
.q


@@ -0,0 +1,51 @@
//------------------------------------------------------------------------------
// CLING - the C++ LLVM-based InterpreterG :)
//
// This file is dual-licensed: you can choose to license it under the University
// of Illinois Open Source License or the GNU Lesser General Public License. See
// LICENSE.TXT for details.
//------------------------------------------------------------------------------
// The test checks whether dynamically sized (runtime) shared memory works.
// RUN: cat %s | %cling -x cuda -Xclang -verify 2>&1 | FileCheck %s
// REQUIRES: cuda-runtime
.rawInput 1
__global__ void gKernel1(int * input, int * output){
extern __shared__ int s[];
int i = threadIdx.x;
s[i] = input[i];
output[i] = s[i];
}
.rawInput 0
int hostInput[4] = {1,2,3,4};
int hostOutput[4] = {0,0,0,0};
int * deviceInput;
int * deviceOutput;
cudaMalloc( (void **) &deviceInput, sizeof(int)*4)
// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
cudaMalloc( (void **) &deviceOutput, sizeof(int)*4)
// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
cudaMemcpy(deviceInput, &hostInput, sizeof(int)*4, cudaMemcpyHostToDevice)
// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
gKernel1<<<1,4, 4 * sizeof(int)>>>(deviceInput, deviceOutput);
cudaGetLastError()
// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
cudaMemcpy(&hostOutput, deviceOutput, sizeof(int)*4, cudaMemcpyDeviceToHost)
// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
// FIXME: output of the whole static array isn't working at the moment
hostOutput[0]
// CHECK: (int) 1
hostOutput[1]
// CHECK: (int) 2
hostOutput[2]
// CHECK: (int) 3
hostOutput[3]
// CHECK: (int) 4
// expected-no-diagnostics
.q


@@ -0,0 +1,40 @@
//------------------------------------------------------------------------------
// CLING - the C++ LLVM-based InterpreterG :)
//
// This file is dual-licensed: you can choose to license it under the University
// of Illinois Open Source License or the GNU Lesser General Public License. See
// LICENSE.TXT for details.
//------------------------------------------------------------------------------
// The test checks whether a CUDA-compatible device is available and whether simple
// __global__ kernels and __device__ functions run.
// RUN: cat %s | %cling -x cuda -Xclang -verify 2>&1 | FileCheck %s
// REQUIRES: cuda-runtime
// Check whether a CUDA-compatible device (GPU) is available.
int device_count = 0;
cudaGetDeviceCount(&device_count)
// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
device_count > 0
// CHECK: (bool) true
// Check whether a minimal __global__ kernel is callable.
.rawInput 1
__global__ void gKernel1(){}
.rawInput 0
gKernel1<<<1,1>>>();
cudaGetLastError()
// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
// Check whether a simple __device__ function is usable.
.rawInput 1
__device__ int dKernel1(){return 42;}
__global__ void gKernel2(){int i = dKernel1();}
.rawInput 0
gKernel2<<<1,1>>>();
cudaGetLastError()
// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
// expected-no-diagnostics
.q


@@ -0,0 +1,58 @@
//------------------------------------------------------------------------------
// CLING - the C++ LLVM-based InterpreterG :)
//
// This file is dual-licensed: you can choose to license it under the University
// of Illinois Open Source License or the GNU Lesser General Public License. See
// LICENSE.TXT for details.
//------------------------------------------------------------------------------
// The test checks whether CUDA streams work.
// RUN: cat %s | %cling -x cuda -Xclang -verify 2>&1 | FileCheck %s
// REQUIRES: cuda-runtime
.rawInput 1
__global__ void gKernel1(int * a, int b){
int i = threadIdx.x;
a[i] += b;
}
.rawInput 0
cudaStream_t stream1, stream2;
cudaStreamCreate(&stream1)
// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
cudaStreamCreate(&stream2)
// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
int host1[] = {1,2,3,4};
int host2[] = {11,12,13,14};
int * device1;
int * device2;
cudaMalloc( (void **) &device1, sizeof(int)*4)
// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
cudaMalloc( (void **) &device2, sizeof(int)*4)
// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
cudaMemcpyAsync(device1, &host1, sizeof(int)*4, cudaMemcpyHostToDevice, stream1)
// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
cudaMemcpyAsync(device2, &host2, sizeof(int)*4, cudaMemcpyHostToDevice, stream2)
// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
gKernel1<<<1,4,0,stream2>>>(device2, 2);
cudaGetLastError()
// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
gKernel1<<<1,4,0,stream1>>>(device1, 1);
cudaGetLastError()
// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
cudaMemcpyAsync(&host2, device2, sizeof(int)*4, cudaMemcpyDeviceToHost, stream2)
// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
cudaMemcpyAsync(&host1, device1, sizeof(int)*4, cudaMemcpyDeviceToHost, stream1)
// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
// Wait for both streams to finish before reading the results on the host.
cudaStreamSynchronize(stream1)
// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
cudaStreamSynchronize(stream2)
// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
host1[0] + host1[1] + host1[2] + host1[3]
// CHECK: (int) 14
host2[0] + host2[1] + host2[2] + host2[3]
// CHECK: (int) 58
// expected-no-diagnostics
.q