433 lines
15 KiB
C++
433 lines
15 KiB
C++
//--------------------------------------------------------------------*- C++ -*-
|
|
// CLING - the C++ LLVM-based InterpreterG :)
|
|
// author: Simeon Ehrig <s.ehrig@hzdr.de>
|
|
//
|
|
// This file is dual-licensed: you can choose to license it under the University
|
|
// of Illinois Open Source License or the GNU Lesser General Public License. See
|
|
// LICENSE.TXT for details.
|
|
//------------------------------------------------------------------------------
|
|
|
|
#include "cling/Interpreter/IncrementalCUDADeviceCompiler.h"
#include "cling/Interpreter/Interpreter.h"
#include "cling/Interpreter/InvocationOptions.h"
#include "cling/Interpreter/Transaction.h"

#include "clang/Basic/TargetOptions.h"
#include "clang/Frontend/CompilerInstance.h"
#include "clang/Lex/HeaderSearchOptions.h"

#include "llvm/Support/Process.h"
#include "llvm/Support/Program.h"
#include "llvm/Support/raw_ostream.h"
#include <llvm/IR/LegacyPassManager.h>
#include <llvm/MC/TargetRegistry.h>
#include <llvm/Support/TargetSelect.h>
#include <llvm/Target/TargetMachine.h>
#include <llvm/Target/TargetOptions.h>

#include <algorithm>
#include <bitset>
#include <memory>
#include <optional>
#include <string>
#include <system_error>
|
|
|
|
namespace cling {
|
|
|
|
/// Sets up the device-side interpreter that compiles CUDA code to PTX.
///
/// The constructor derives the PTX compiler settings from the host compiler
/// instance (see setCuArgs()), assembles a cling command line from them,
/// creates the PTX interpreter with it and initializes the NVPTX backend.
/// m_Init is set to true only if every step was reached; if the fatbinary
/// file name is empty the constructor returns early and m_CuArgs stays unset.
///
/// \param [in] filePath stored in m_FilePath.
/// \param [in] optLevel optimization level, forwarded as "-O<optLevel>".
/// \param [in] invocationOptions host invocation options; "-include..."
///             arguments found in CompilerOpts.Remaining are forwarded.
/// \param [in] CI host compiler instance; provides the fatbinary file name,
///             language options, debug info kind, target triple and header
///             search options.
IncrementalCUDADeviceCompiler::IncrementalCUDADeviceCompiler(
    const std::string& filePath, const int optLevel,
    const cling::InvocationOptions& invocationOptions,
    const clang::CompilerInstance& CI)
    : m_FilePath(filePath),
      m_FatbinFilePath(CI.getCodeGenOpts().CudaGpuBinaryFileName) {
  if (m_FatbinFilePath.empty()) {
    llvm::errs() << "Error: CudaGpuBinaryFileNames can't be empty\n";
    return;
  }

  setCuArgs(CI.getLangOpts(), invocationOptions,
            CI.getCodeGenOpts().getDebugInfo(),
            llvm::Triple(CI.getTargetOpts().Triple));

  // cling -std=c++xx -Ox -x cuda -S --cuda-gpu-arch=sm_xx --cuda-device-only
  // ${include headers} ${-I/paths} [-v] [-g] ${m_CuArgs->additionalPtxOpt}
  argv = {"cling",
          m_CuArgs->cppStdVersion.c_str(),
          "-O" + std::to_string(optLevel),
          "-x",
          "cuda",
          "-S",
          std::string("--cuda-gpu-arch=sm_")
              .append(std::to_string(m_CuArgs->smVersion)),
          "--cuda-device-only"};

  addHeaderSearchPathFlags(argv, CI.getHeaderSearchOptsPtr());

  if (m_CuArgs->verbose)
    argv.push_back("-v");
  if (m_CuArgs->debug)
    argv.push_back("-g");
  argv.insert(argv.end(), m_CuArgs->additionalPtxOpt.begin(),
              m_CuArgs->additionalPtxOpt.end());

  // Forward every "-include..." argument to the PTX compiler. A bounded
  // prefix comparison is used (same form as the "-D" check in setCuArgs())
  // instead of searching the whole string for the pattern.
  for (const char* c : invocationOptions.CompilerOpts.Remaining) {
    std::string s(c);
    if (s.compare(0, 8, "-include") == 0)
      argv.push_back(s);
  }

  // Build the C-style argument vector: one pointer per argument followed by
  // exactly one terminating nullptr. The terminator is not an argument and
  // therefore must not be counted in argc.
  // The pointers refer to the strings stored in argv, so argv must not be
  // modified while argvChar is in use.
  std::vector<const char*> argvChar;
  argvChar.reserve(argv.size() + 1);
  for (const std::string& s : argv)
    argvChar.push_back(s.c_str());
  argvChar.push_back(nullptr);

  // create incremental compiler instance
  m_PTX_interp.reset(
      new Interpreter(static_cast<int>(argv.size()), argvChar.data()));

  // NOTE(review): a plain `new` expression never yields nullptr, so this
  // guard cannot fire. A validity query on the Interpreter would be the
  // meaningful check here -- confirm what the Interpreter API offers.
  if (!m_PTX_interp) {
    llvm::errs() << "Could not create PTX interpreter instance\n";
    return;
  }

  // initialize NVPTX backend
  LLVMInitializeNVPTXTargetInfo();
  LLVMInitializeNVPTXTarget();
  LLVMInitializeNVPTXTargetMC();
  LLVMInitializeNVPTXAsmPrinter();

  m_Init = true;
}
|
|
|
|
void IncrementalCUDADeviceCompiler::setCuArgs(
|
|
const clang::LangOptions& langOpts,
|
|
const cling::InvocationOptions& invocationOptions,
|
|
const clang::codegenoptions::DebugInfoKind debugInfo,
|
|
const llvm::Triple hostTriple) {
|
|
std::string cppStdVersion;
|
|
// Set the c++ standard. Just one condition is possible.
|
|
if (langOpts.CPlusPlus11)
|
|
cppStdVersion = "-std=c++11";
|
|
if (langOpts.CPlusPlus14)
|
|
cppStdVersion = "-std=c++14";
|
|
if (langOpts.CPlusPlus17)
|
|
cppStdVersion = "-std=c++1z";
|
|
if (langOpts.CPlusPlus20)
|
|
cppStdVersion = "-std=c++20";
|
|
|
|
if (cppStdVersion.empty())
|
|
llvm::errs()
|
|
<< "IncrementalCUDADeviceCompiler: No valid c++ standard is set.\n";
|
|
|
|
uint32_t smVersion = 35;
|
|
if (!invocationOptions.CompilerOpts.CUDAGpuArch.empty()) {
|
|
llvm::StringRef(invocationOptions.CompilerOpts.CUDAGpuArch)
|
|
.drop_front(3 /* sm_ */)
|
|
.getAsInteger(10, smVersion);
|
|
}
|
|
|
|
// FIXME : Should not reduce the fine granulated debug options to a simple.
|
|
// -g
|
|
bool debug = false;
|
|
if (debugInfo == clang::codegenoptions::DebugLineTablesOnly ||
|
|
debugInfo == clang::codegenoptions::LimitedDebugInfo ||
|
|
debugInfo == clang::codegenoptions::FullDebugInfo)
|
|
debug = true;
|
|
|
|
// FIXME : Cling has problems to detect these arguments.
|
|
/*
|
|
if(langOpts.CUDADeviceFlushDenormalsToZero)
|
|
m_CuArgs.additionalPtxOpt.push_back("-fcuda-flush-denormals-to-zero");
|
|
if(langOpts.CUDADeviceApproxTranscendentals)
|
|
m_CuArgs.additionalPtxOpt.push_back("-fcuda-approx-transcendentals");
|
|
if(langOpts.CUDAAllowVariadicFunctions)
|
|
m_CuArgs.additionalPtxOpt.push_back("-fcuda-allow-variadic-functions");
|
|
*/
|
|
std::vector<std::string> additionalPtxOpt;
|
|
|
|
// search for defines (-Dmacros=value) in the args and add them to the PTX
|
|
// compiler args
|
|
for (const char* arg : invocationOptions.CompilerOpts.Remaining) {
|
|
std::string s = arg;
|
|
if (s.compare(0, 2, "-D") == 0)
|
|
additionalPtxOpt.push_back(s);
|
|
}
|
|
|
|
// use custom CUDA SDK path
|
|
if(!invocationOptions.CompilerOpts.CUDAPath.empty()){
|
|
additionalPtxOpt.push_back("--cuda-path=" + invocationOptions.CompilerOpts.CUDAPath);
|
|
}
|
|
|
|
enum FatBinFlags {
|
|
AddressSize64 = 0x01,
|
|
HasDebugInfo = 0x02,
|
|
ProducerCuda = 0x04,
|
|
HostLinux = 0x10,
|
|
HostMac = 0x20,
|
|
HostWindows = 0x40
|
|
};
|
|
|
|
uint32_t fatbinFlags = FatBinFlags::ProducerCuda;
|
|
if (debug)
|
|
fatbinFlags |= FatBinFlags::HasDebugInfo;
|
|
|
|
if (hostTriple.isArch64Bit())
|
|
fatbinFlags |= FatBinFlags::AddressSize64;
|
|
|
|
if (hostTriple.isOSWindows())
|
|
fatbinFlags |= FatBinFlags::HostWindows;
|
|
else if (hostTriple.isOSDarwin())
|
|
fatbinFlags |= FatBinFlags::HostMac;
|
|
else
|
|
fatbinFlags |= FatBinFlags::HostLinux;
|
|
|
|
m_CuArgs.reset(new IncrementalCUDADeviceCompiler::CUDACompilerArgs(
|
|
cppStdVersion, hostTriple, smVersion, fatbinFlags,
|
|
invocationOptions.Verbose(), debug, additionalPtxOpt));
|
|
}
|
|
|
|
void IncrementalCUDADeviceCompiler::addHeaderSearchPathFlags(
|
|
std::vector<std::string>& argv,
|
|
const std::shared_ptr<clang::HeaderSearchOptions> &headerSearchOptions) {
|
|
for (clang::HeaderSearchOptions::Entry e :
|
|
headerSearchOptions->UserEntries) {
|
|
if (e.Group == clang::frontend::IncludeDirGroup::Quoted) {
|
|
argv.push_back("-iquote");
|
|
argv.push_back(e.Path);
|
|
}
|
|
|
|
if (e.Group == clang::frontend::IncludeDirGroup::Angled)
|
|
argv.push_back("-I" + e.Path);
|
|
}
|
|
}
|
|
|
|
// FIXME: add the same arguments as the cling::Interpreter class -> need some
|
|
// modifications in the cling::Transaction class to store information from the
|
|
// device compiler
|
|
bool IncrementalCUDADeviceCompiler::process(const std::string& input) {
|
|
if (!m_Init) {
|
|
llvm::errs()
|
|
<< "Error: Initializiation of CUDA Device Code Compiler failed\n";
|
|
return false;
|
|
}
|
|
|
|
Interpreter::CompilationResult CR = m_PTX_interp->process(input);
|
|
|
|
if (CR == Interpreter::CompilationResult::kFailure) {
|
|
llvm::errs() << "IncrementalCUDADeviceCompiler::process()\n"
|
|
<< "failed at compile ptx code\n";
|
|
return false;
|
|
}
|
|
|
|
// for example blocks which are not closed
|
|
if (CR == Interpreter::CompilationResult::kMoreInputExpected)
|
|
return true;
|
|
|
|
if (!generatePTX() || !generateFatbinary())
|
|
return false;
|
|
|
|
return true;
|
|
}
|
|
|
|
// FIXME: see process()
|
|
bool IncrementalCUDADeviceCompiler::declare(const std::string& input) {
|
|
if (!m_Init) {
|
|
llvm::errs()
|
|
<< "Error: Initializiation of CUDA Device Code Compiler failed\n";
|
|
return false;
|
|
}
|
|
|
|
Interpreter::CompilationResult CR = m_PTX_interp->declare(input);
|
|
|
|
if (CR == Interpreter::CompilationResult::kFailure) {
|
|
llvm::errs() << "IncrementalCUDADeviceCompiler::declare()\n"
|
|
<< "failed at compile ptx code\n";
|
|
return false;
|
|
}
|
|
|
|
// for example blocks which are not closed
|
|
if (CR == Interpreter::CompilationResult::kMoreInputExpected)
|
|
return true;
|
|
|
|
if (!generatePTX() || !generateFatbinary())
|
|
return false;
|
|
|
|
return true;
|
|
}
|
|
|
|
// FIXME: see process()
|
|
bool IncrementalCUDADeviceCompiler::parse(const std::string& input) const {
|
|
if (!m_Init) {
|
|
llvm::errs()
|
|
<< "Error: Initializiation of CUDA Device Code Compiler failed\n";
|
|
return false;
|
|
}
|
|
|
|
Interpreter::CompilationResult CR = m_PTX_interp->parse(input);
|
|
|
|
if (CR == Interpreter::CompilationResult::kFailure) {
|
|
llvm::errs() << "IncrementalCUDADeviceCompiler::parse()"
|
|
<< "failed at compile ptx code\n";
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
bool cling::IncrementalCUDADeviceCompiler::generatePTX() {
|
|
// delete compiled PTX code of last input
|
|
m_PTX_code = "";
|
|
|
|
llvm::Module* module = m_PTX_interp->getLastTransaction()->getModule();
|
|
|
|
std::string error;
|
|
auto Target =
|
|
llvm::TargetRegistry::lookupTarget(module->getTargetTriple(), error);
|
|
|
|
if (!Target) {
|
|
llvm::errs() << error;
|
|
return 1;
|
|
}
|
|
|
|
// is not important, because PTX does not use any object format
|
|
std::optional<llvm::Reloc::Model> RM =
|
|
std::optional<llvm::Reloc::Model>(llvm::Reloc::Model::PIC_);
|
|
|
|
llvm::TargetOptions TO = llvm::TargetOptions();
|
|
|
|
llvm::TargetMachine* targetMachine = Target->createTargetMachine(
|
|
module->getTargetTriple(),
|
|
std::string("sm_").append(std::to_string(m_CuArgs->smVersion)), "", TO,
|
|
RM);
|
|
module->setDataLayout(targetMachine->createDataLayout());
|
|
|
|
llvm::raw_svector_ostream dest(m_PTX_code);
|
|
|
|
llvm::legacy::PassManager pass;
|
|
// it's important to use the type assembler
|
|
// object file is not supported and do not make sense
|
|
llvm::CodeGenFileType FileType = llvm::CGFT_AssemblyFile;
|
|
|
|
if (targetMachine->addPassesToEmitFile(pass, dest, /*DwoOut*/ nullptr,
|
|
FileType)) {
|
|
llvm::errs() << "TargetMachine can't emit assembler code";
|
|
return 1;
|
|
}
|
|
|
|
return pass.run(*module);
|
|
}
|
|
|
|
bool IncrementalCUDADeviceCompiler::generateFatbinary() {
|
|
// FIXME: At the moment the fatbin code must be writen to a file so that
|
|
// CodeGen can use it. This should be replaced by a in-memory solution
|
|
// (e.g. virtual file).
|
|
std::error_code EC;
|
|
llvm::raw_fd_ostream os(m_FatbinFilePath, EC, llvm::sys::fs::OF_None);
|
|
if (EC) {
|
|
llvm::errs() << "ERROR: cannot generate file " << m_FatbinFilePath
|
|
<< "\n";
|
|
return false;
|
|
}
|
|
|
|
// implementation is adapted from clangJIT
|
|
// (https://github.com/hfinkel/llvm-project-cxxjit/blob/cxxjit/clang/lib/CodeGen/JIT.cpp)
|
|
// void *resolveFunction(const void *NTTPValues, const char **TypeStrings,
|
|
// unsigned Idx)
|
|
|
|
// The outer header of the fat binary is documented in the CUDA
|
|
// fatbinary.h header. As mentioned there, the overall size must be a
|
|
// multiple of eight, and so we must make sure that the PTX is.
|
|
// We also need to make sure that the buffer is explicitly null
|
|
// terminated (cuobjdump, at least, seems to assume that it is).
|
|
m_PTX_code += '\0';
|
|
while (m_PTX_code.size() % 8)
|
|
m_PTX_code += '\0';
|
|
|
|
// NVIDIA, unfortunatly, does not provide full documentation on their
|
|
// fatbin format. There is some information on the outer header block in
|
|
// the CUDA fatbinary.h header. Also, it is possible to figure out more
|
|
// about the format by creating fatbins using the provided utilities
|
|
// and then observing what cuobjdump reports about the resulting files.
|
|
// There are some other online references which shed light on the format,
|
|
// including https://reviews.llvm.org/D8397 and FatBinaryContext.{cpp,h}
|
|
// from the GPU Ocelot project (https://github.com/gtcasl/gpuocelot).
|
|
|
|
struct FatBinHeader {
|
|
uint32_t Magic; // 0x00
|
|
uint16_t Version; // 0x04
|
|
uint16_t HeaderSize; // 0x06
|
|
uint32_t DataSize; // 0x08
|
|
uint32_t unknown0c; // 0x0c
|
|
public:
|
|
FatBinHeader(uint32_t DataSize)
|
|
: Magic(0xba55ed50), Version(1), HeaderSize(sizeof(*this)),
|
|
DataSize(DataSize), unknown0c(0) {}
|
|
};
|
|
|
|
struct FatBinFileHeader {
|
|
uint16_t Kind; // 0x00
|
|
uint16_t unknown02; // 0x02
|
|
uint32_t HeaderSize; // 0x04
|
|
uint32_t DataSize; // 0x08
|
|
uint32_t unknown0c; // 0x0c
|
|
uint32_t CompressedSize; // 0x10
|
|
uint32_t SubHeaderSize; // 0x14
|
|
uint16_t VersionMinor; // 0x18
|
|
uint16_t VersionMajor; // 0x1a
|
|
uint32_t CudaArch; // 0x1c
|
|
uint32_t unknown20; // 0x20
|
|
uint32_t unknown24; // 0x24
|
|
uint32_t Flags; // 0x28
|
|
uint32_t unknown2c; // 0x2c
|
|
uint32_t unknown30; // 0x30
|
|
uint32_t unknown34; // 0x34
|
|
uint32_t UncompressedSize; // 0x38
|
|
uint32_t unknown3c; // 0x3c
|
|
uint32_t unknown40; // 0x40
|
|
uint32_t unknown44; // 0x44
|
|
FatBinFileHeader(uint32_t DataSize, uint32_t CudaArch, uint32_t Flags)
|
|
: Kind(1 /*PTX*/), unknown02(0x0101), HeaderSize(sizeof(*this)),
|
|
DataSize(DataSize), unknown0c(0), CompressedSize(0),
|
|
SubHeaderSize(HeaderSize - 8), VersionMinor(2), VersionMajor(4),
|
|
CudaArch(CudaArch), unknown20(0), unknown24(0), Flags(Flags),
|
|
unknown2c(0), unknown30(0), unknown34(0), UncompressedSize(0),
|
|
unknown3c(0), unknown40(0), unknown44(0) {}
|
|
};
|
|
|
|
FatBinFileHeader fatBinFileHeader(m_PTX_code.size(), m_CuArgs->smVersion,
|
|
m_CuArgs->fatbinFlags);
|
|
FatBinHeader fatBinHeader(m_PTX_code.size() + fatBinFileHeader.HeaderSize);
|
|
|
|
os.write((char*)&fatBinHeader, fatBinHeader.HeaderSize);
|
|
os.write((char*)&fatBinFileHeader, fatBinFileHeader.HeaderSize);
|
|
os << m_PTX_code;
|
|
|
|
return true;
|
|
}
|
|
|
|
/// Prints the state of the device compiler to llvm::outs(): the init flag,
/// the file paths and, if available, all values stored in m_CuArgs.
void IncrementalCUDADeviceCompiler::dump() {
  llvm::outs() << "CUDA device compiler is valid: " << m_Init << "\n"
               << "file path: " << m_FilePath << "\n"
               << "fatbin file path: " << m_FatbinFilePath << "\n";

  // The constructor returns before setCuArgs() is called if the fatbinary
  // file name is empty. In that case there are no arguments to print and
  // m_CuArgs must not be dereferenced.
  if (!m_CuArgs) {
    llvm::outs() << "m_CuArgs: not set\n";
    return;
  }

  llvm::outs() << "m_CuArgs c++ standard: " << m_CuArgs->cppStdVersion << "\n"
               << "m_CuArgs host triple: " << m_CuArgs->hostTriple.str()
               << "\n"
               << "m_CuArgs Nvidia SM Version: " << m_CuArgs->smVersion
               << "\n"
               << "m_CuArgs Fatbin Flags (see "
                  "IncrementalCUDADeviceCompiler::setCuArgs()): "
               << std::bitset<7>(m_CuArgs->fatbinFlags).to_string() << "\n"
               << "m_CuArgs verbose: " << m_CuArgs->verbose << "\n"
               << "m_CuArgs debug: " << m_CuArgs->debug << "\n";
  llvm::outs() << "m_CuArgs additional clang nvptx options: ";
  for (const std::string& s : m_CuArgs->additionalPtxOpt) {
    llvm::outs() << s << " ";
  }
  llvm::outs() << "\n";
}
|
|
|
|
} // end namespace cling
|