From be5ea3a6517d1758ff58f061237edb1197453998 Mon Sep 17 00:00:00 2001 From: Simeon Ehrig Date: Thu, 11 Feb 2021 16:20:30 +0000 Subject: [PATCH] Fixed CUDA mode for Clang/LLVM 9 upgrade - fix bug, which was caused by executing a transaction in the device interpreter - fixed warning from the device compiler - update test cases --- lib/Interpreter/CIFactory.cpp | 5 +- lib/Interpreter/Interpreter.cpp | 9 ++-- test/CUDADeviceCode/CUDADefineArg.C | 6 +-- test/CUDADeviceCode/CUDAHostPrefix.C | 8 +-- test/CUDADeviceCode/CUDAInclude.C | 6 +-- test/CUDADeviceCode/CUDAKernelArgument.C | 16 +++--- .../CUDAKernelTemplateComplex.C | 50 +++++++++---------- .../CUDADeviceCode/CUDAKernelTemplateSimple.C | 14 +++--- test/CUDADeviceCode/CUDARegression.C | 12 ++--- test/CUDADeviceCode/CUDASharedMemory.C | 12 ++--- test/CUDADeviceCode/CUDASimpleKernel.C | 12 ++--- test/CUDADeviceCode/CUDAStreams.C | 24 ++++----- test/CodeGeneration/CUDACtorDtor.C | 9 +++- test/Driver/CUDAMode.C | 2 +- 14 files changed, 98 insertions(+), 87 deletions(-) diff --git a/lib/Interpreter/CIFactory.cpp b/lib/Interpreter/CIFactory.cpp index 3a1bc56d..851bc0bb 100644 --- a/lib/Interpreter/CIFactory.cpp +++ b/lib/Interpreter/CIFactory.cpp @@ -1339,7 +1339,10 @@ static void stringifyPreprocSetting(PreprocessorOptions& PPOpts, #endif if (!COpts.HasOutput || !HasInput) { - argvCompile.push_back("-c"); + // suppress the warning "argument unused during compilation: -c" of the + // device interpreter instance + if (!COpts.CUDADevice) + argvCompile.push_back("-c"); argvCompile.push_back("-"); } diff --git a/lib/Interpreter/Interpreter.cpp b/lib/Interpreter/Interpreter.cpp index cf0a2aba..ace79159 100644 --- a/lib/Interpreter/Interpreter.cpp +++ b/lib/Interpreter/Interpreter.cpp @@ -1670,9 +1670,12 @@ namespace cling { IncrementalExecutor::ExecutionResult ExeRes = IncrementalExecutor::kExeSuccess; - // Forward to IncrementalExecutor; should not be called by - // anyone except for IncrementalParser. - ExeRes = m_Executor->runStaticInitializersOnce(T); + // CUDA device code is not direct executable + // the code is executed by a CUDA library function in the host code + if (!m_Opts.CompilerOpts.CUDADevice) + // Forward to IncrementalExecutor; should not be called by + // anyone except for IncrementalParser. + ExeRes = m_Executor->runStaticInitializersOnce(T); return ConvertExecutionResult(ExeRes); } diff --git a/test/CUDADeviceCode/CUDADefineArg.C b/test/CUDADeviceCode/CUDADefineArg.C index 3a0a08bf..dd82bdf7 100644 --- a/test/CUDADeviceCode/CUDADefineArg.C +++ b/test/CUDADeviceCode/CUDADefineArg.C @@ -17,12 +17,12 @@ // Check if cuda driver is available int version; cudaDriverGetVersion(&version) -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 // Check if a CUDA compatible device (GPU) is available. int device_count = 0; cudaGetDeviceCount(&device_count) -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 device_count > 0 // CHECK: (bool) true @@ -42,7 +42,7 @@ __global__ void g(){ // the host interpreter. g<<<1,1>>>(); cudaGetLastError() -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 // expected-no-diagnostics .q diff --git a/test/CUDADeviceCode/CUDAHostPrefix.C b/test/CUDADeviceCode/CUDAHostPrefix.C index a93d3949..34e91bed 100644 --- a/test/CUDADeviceCode/CUDAHostPrefix.C +++ b/test/CUDADeviceCode/CUDAHostPrefix.C @@ -28,15 +28,15 @@ sum(41,1) int hostOutput = 0; int * deviceOutput; cudaMalloc( (void **) &deviceOutput, sizeof(int)) -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 gKernel1<<<1,1>>>(deviceOutput); cudaGetLastError() -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 cudaDeviceSynchronize() -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 cudaMemcpy(&hostOutput, deviceOutput, sizeof(int), cudaMemcpyDeviceToHost) -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 hostOutput // CHECK: (int) 42 diff --git a/test/CUDADeviceCode/CUDAInclude.C b/test/CUDADeviceCode/CUDAInclude.C index ea706edc..c6bb6b9b 100644 --- a/test/CUDADeviceCode/CUDAInclude.C +++ b/test/CUDADeviceCode/CUDAInclude.C @@ -17,12 +17,12 @@ // Check if cuda driver is available int version; cudaDriverGetVersion(&version) -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 // Check if a CUDA compatible device (GPU) is available. int device_count = 0; cudaGetDeviceCount(&device_count) -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 device_count > 0 // CHECK: (bool) true @@ -39,7 +39,7 @@ foo() // the host interpreter. bar<<<1,1>>>(); cudaGetLastError() -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 // expected-no-diagnostics diff --git a/test/CUDADeviceCode/CUDAKernelArgument.C b/test/CUDADeviceCode/CUDAKernelArgument.C index 444ef26b..dd924eae 100644 --- a/test/CUDADeviceCode/CUDAKernelArgument.C +++ b/test/CUDADeviceCode/CUDAKernelArgument.C @@ -22,15 +22,15 @@ __global__ void gKernel1(int * out){ int * deviceOutput; int hostOutput = 0; cudaMalloc( (void **) &deviceOutput, sizeof(int)) -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 gKernel1<<<1,1>>>(deviceOutput); cudaGetLastError() -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 cudaDeviceSynchronize() -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 cudaMemcpy(&hostOutput, deviceOutput, sizeof(int), cudaMemcpyDeviceToHost) -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 hostOutput // CHECK: (int) 42 @@ -55,15 +55,15 @@ for(unsigned int i = 0; i < numberOfThreads; ++i){ hostOutput2[i] = 0; } cudaMalloc( (void **) &deviceOutput2, sizeof(int)*numberOfThreads) -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 gKernel2<<<1,numberOfThreads>>>(deviceOutput2); cudaGetLastError() -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 cudaDeviceSynchronize() -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 cudaMemcpy(hostOutput2, deviceOutput2, sizeof(int)*numberOfThreads, cudaMemcpyDeviceToHost) -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 unsigned int expectedSum = 0; unsigned int cudaSum = 0; diff --git a/test/CUDADeviceCode/CUDAKernelTemplateComplex.C b/test/CUDADeviceCode/CUDAKernelTemplateComplex.C index b1552d7e..abd06d41 100644 --- a/test/CUDADeviceCode/CUDAKernelTemplateComplex.C +++ b/test/CUDADeviceCode/CUDAKernelTemplateComplex.C @@ -23,15 +23,15 @@ __global__ void gKernel1(T * value){ int * deviceOutput1; int hostOutput1 = 1; cudaMalloc( (void **) &deviceOutput1, sizeof(int)) -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 gKernel1<<<1,1>>>(deviceOutput1); cudaGetLastError() -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 cudaDeviceSynchronize() -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 cudaMemcpy(&hostOutput1, deviceOutput1, sizeof(int), cudaMemcpyDeviceToHost) -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 hostOutput1 // CHECK: (int) 42 @@ -55,27 +55,27 @@ __global__ void gKernel2(float * value){ int * deviceOutput2; int hostOutput2 = 10; cudaMalloc( (void **) &deviceOutput2, sizeof(int)) -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 float * deviceOutput3; float hostOutput3= 10.0; cudaMalloc( (void **) &deviceOutput3, sizeof(float)) -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 gKernel2<<<1,1>>>(deviceOutput2); cudaGetLastError() -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 cudaDeviceSynchronize() -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 gKernel2<<<1,1>>>(deviceOutput3); cudaGetLastError() -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 cudaDeviceSynchronize() -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 cudaMemcpy(&hostOutput2, deviceOutput2, sizeof(int), cudaMemcpyDeviceToHost) -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 cudaMemcpy(&hostOutput3, deviceOutput3, sizeof(float), cudaMemcpyDeviceToHost) -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 hostOutput2 // CHECK: (int) 1 @@ -107,19 +107,19 @@ int func1(int * input){ int * deviceOutput4; int hostOutput4 = 10; cudaMalloc( (void **) &deviceOutput4, sizeof(int)) -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 int * deviceOutput5; cudaMalloc( (void **) &deviceOutput5, sizeof(int)) -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 gKernel3<<<1,1>>>(deviceOutput4, func1(deviceOutput5)); cudaGetLastError() -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 cudaDeviceSynchronize() -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 cudaMemcpy(&hostOutput4, deviceOutput4, sizeof(int), cudaMemcpyDeviceToHost) -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 hostOutput4 // CHECK: (int) 5 @@ -155,27 +155,27 @@ __global__ void gKernel5(T * x, Functor const functor){ int * deviceOutput6; int hostOutput6 = 10; cudaMalloc( (void **) &deviceOutput6, sizeof(int)) -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 double * deviceOutput7; double hostOutput7 = 10.0; cudaMalloc( (void **) &deviceOutput7, sizeof(double)) -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 gKernel5<<<1,1>>>(deviceOutput6, Struct1{}); cudaGetLastError() -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 cudaDeviceSynchronize() -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 gKernel5<<<1,1>>>(deviceOutput7, Struct1{}); cudaGetLastError() -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 cudaDeviceSynchronize() -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 cudaMemcpy(&hostOutput6, deviceOutput6, sizeof(int), cudaMemcpyDeviceToHost) -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 cudaMemcpy(&hostOutput7, deviceOutput7, sizeof(double), cudaMemcpyDeviceToHost) -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 hostOutput6 // CHECK: (int) 1 diff --git a/test/CUDADeviceCode/CUDAKernelTemplateSimple.C b/test/CUDADeviceCode/CUDAKernelTemplateSimple.C index 8a6ae72b..7feba809 100644 --- a/test/CUDADeviceCode/CUDAKernelTemplateSimple.C +++ b/test/CUDADeviceCode/CUDAKernelTemplateSimple.C @@ -26,15 +26,15 @@ __global__ void gKernel1(int * out){ int * deviceOutput; int hostOutput = 0; cudaMalloc( (void **) &deviceOutput, sizeof(int)) -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 gKernel1<<<1,1>>>(deviceOutput); cudaGetLastError() -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 cudaDeviceSynchronize() -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 cudaMemcpy(&hostOutput, deviceOutput, sizeof(int), cudaMemcpyDeviceToHost) -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 hostOutput // CHECK: (int) 42 @@ -50,11 +50,11 @@ __global__ void gKernel2(int * out){ hostOutput = 0; gKernel2<43><<<1,1>>>(deviceOutput); cudaGetLastError() -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 cudaDeviceSynchronize() -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 cudaMemcpy(&hostOutput, deviceOutput, sizeof(int), cudaMemcpyDeviceToHost) -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 hostOutput // CHECK: (int) 43 diff --git a/test/CUDADeviceCode/CUDARegression.C b/test/CUDADeviceCode/CUDARegression.C index 7aad3937..dc9731fb 100644 --- a/test/CUDADeviceCode/CUDARegression.C +++ b/test/CUDADeviceCode/CUDARegression.C @@ -16,7 +16,7 @@ // if process() works, the general input also works gCling->process("cudaGetLastError()"); -//CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +//CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 // declare a cuda kernel with with a define // do not this in real code ;-) @@ -27,16 +27,16 @@ gCling->declare("__global__ void g1(int * out){ *out = FOO;}"); int *d1; int h1 = 0; cudaMalloc((void**)&d1, sizeof(int)) -//CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +//CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 // run kernel g1<<<1,1>>>(d1); cudaGetLastError() -//CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +//CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 // check result cudaMemcpy(&h1, d1, sizeof(int), cudaMemcpyDeviceToHost) -//CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +//CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 h1 //CHECK: (int) 42 @@ -49,10 +49,10 @@ gCling->parse("__global__ void g2(int * out){ *out = 52;}"); g2<<<1,1>>>(d1); cudaGetLastError() -//CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +//CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 cudaMemcpy(&h1, d1, sizeof(int), cudaMemcpyDeviceToHost) -//CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +//CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 h1 //CHECK: (int) 52 diff --git a/test/CUDADeviceCode/CUDASharedMemory.C b/test/CUDADeviceCode/CUDASharedMemory.C index 851aee7a..a3430299 100644 --- a/test/CUDADeviceCode/CUDASharedMemory.C +++ b/test/CUDADeviceCode/CUDASharedMemory.C @@ -32,19 +32,19 @@ for(unsigned int i = 0; i < numberOfThreads; ++i){ int * deviceInput; int * deviceOutput; cudaMalloc( (void **) &deviceInput, sizeof(int)*numberOfThreads) -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 cudaMalloc( (void **) &deviceOutput, sizeof(int)*numberOfThreads) -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 cudaMemcpy(deviceInput, hostInput, sizeof(int)*numberOfThreads, cudaMemcpyHostToDevice) -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 gKernel1<<<1,numberOfThreads, sizeof(int)*numberOfThreads>>>(deviceInput, deviceOutput); cudaGetLastError() -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 cudaDeviceSynchronize() -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 cudaMemcpy(hostOutput, deviceOutput, sizeof(int)*numberOfThreads, cudaMemcpyDeviceToHost) -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 int expectedSum = (numberOfThreads*(numberOfThreads+1))/2; int cudaSum = 0; diff --git a/test/CUDADeviceCode/CUDASimpleKernel.C b/test/CUDADeviceCode/CUDASimpleKernel.C index 187aaaa0..4fecaeb8 100644 --- a/test/CUDADeviceCode/CUDASimpleKernel.C +++ b/test/CUDADeviceCode/CUDASimpleKernel.C @@ -15,12 +15,12 @@ // Check if cuda driver is available int version; cudaDriverGetVersion(&version) -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 // Check if a CUDA compatible device (GPU) is available. int device_count = 0; cudaGetDeviceCount(&device_count) -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 device_count > 0 // CHECK: (bool) true @@ -30,9 +30,9 @@ __global__ void gKernel1(){} .rawInput 0 gKernel1<<<1,1>>>(); cudaGetLastError() -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 cudaDeviceSynchronize() -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 // Check, if a simple __device__ kernel is useable. .rawInput 1 @@ -41,9 +41,9 @@ __global__ void gKernel2(){int i = dKernel1();} .rawInput 0 gKernel2<<<1,1>>>(); cudaGetLastError() -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 cudaDeviceSynchronize() -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 // expected-no-diagnostics diff --git a/test/CUDADeviceCode/CUDAStreams.C b/test/CUDADeviceCode/CUDAStreams.C index b4fa2a4c..902c646f 100644 --- a/test/CUDADeviceCode/CUDAStreams.C +++ b/test/CUDADeviceCode/CUDAStreams.C @@ -22,9 +22,9 @@ __global__ void gKernel1(int * a, int b){ cudaStream_t stream1, stream2; cudaStreamCreate(&stream1) -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 cudaStreamCreate(&stream2) -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 int hostInput1[numberOfThreads]; int hostInput2[numberOfThreads]; @@ -37,30 +37,30 @@ for(unsigned int i = 0; i < numberOfThreads; ++i){ int * device1; int * device2; cudaMalloc( (void **) &device1, sizeof(int)*numberOfThreads) -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 cudaMalloc( (void **) &device2, sizeof(int)*numberOfThreads) -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 cudaMemcpyAsync(device1, hostInput1, sizeof(int)*numberOfThreads, cudaMemcpyHostToDevice, stream1) -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 cudaMemcpyAsync(device2, hostInput2, sizeof(int)*numberOfThreads, cudaMemcpyHostToDevice, stream2) -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 gKernel1<<<1,numberOfThreads,0,stream2>>>(device2, 2); cudaGetLastError() -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 cudaDeviceSynchronize() -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 gKernel1<<<1,numberOfThreads,0,stream1>>>(device1, 1); cudaGetLastError() -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 cudaDeviceSynchronize() -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 cudaMemcpyAsync(hostOutput2, device2, sizeof(int)*numberOfThreads, cudaMemcpyDeviceToHost, stream2) -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 cudaMemcpyAsync(hostOutput1, device1, sizeof(int)*numberOfThreads, cudaMemcpyDeviceToHost, stream1) -// CHECK: (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError_t) (cudaSuccess) : (unsigned int) 0 unsigned int expectedSum1 = 0; unsigned int cudaSum1 = 0; diff --git a/test/CodeGeneration/CUDACtorDtor.C b/test/CodeGeneration/CUDACtorDtor.C index eb3b5410..f445e4ae 100644 --- a/test/CodeGeneration/CUDACtorDtor.C +++ b/test/CodeGeneration/CUDACtorDtor.C @@ -21,7 +21,10 @@ // Compare the cuda module ctor and dtor of two random modules. std::string ctor1, ctor2, dtor1, dtor2; -auto M1 = gCling->getLatestTransaction()->getModule(); +auto T1 = gCling->getLatestTransaction(); +// only __global__ CUDA kernel definitions has the cuda module ctor and dtor +__global__ void g1() { int i = 1; } +auto M1 = T1->getNext()->getModule(); for(auto &I : *M1){ // The trailing '_' identify the function name as modified name. @@ -34,7 +37,9 @@ for(auto &I : *M1){ } } -auto M2 = gCling->getLatestTransaction()->getModule(); +auto T2 = gCling->getLatestTransaction(); +__global__ void g2() { int i = 2; } +auto M2 = T2->getNext()->getModule(); // The two modules should have different names, because of the for loop. M1->getName().str() != M2->getName().str() diff --git a/test/Driver/CUDAMode.C b/test/Driver/CUDAMode.C index dac6d740..3cc790c9 100644 --- a/test/Driver/CUDAMode.C +++ b/test/Driver/CUDAMode.C @@ -19,7 +19,7 @@ __global__ void foo(){ int i = 3; } .rawInput 0 cudaError error -// CHECK: (cudaError) (cudaError::cudaSuccess) : (unsigned int) 0 +// CHECK: (cudaError) (cudaSuccess) : (unsigned int) 0 // expected-no-diagnostics .q