Have Value::dump write unicode and strings with embedded data properly.
This commit is contained in:
parent
f39f1f7318
commit
7eea9235da
@ -25,8 +25,18 @@
|
||||
|
||||
#include "llvm/IR/GlobalValue.h"
|
||||
#include "llvm/IR/Module.h"
|
||||
#include "llvm/Support/Format.h"
|
||||
#include "llvm/Support/raw_os_ostream.h"
|
||||
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <locale>
|
||||
|
||||
#ifdef LLVM_ON_WIN32
|
||||
#include <Shlwapi.h>
|
||||
#define strcasestr StrStrIA
|
||||
#pragma comment(lib, "Shlwapi.lib")
|
||||
#endif
|
||||
|
||||
namespace {
|
||||
|
||||
@ -250,7 +260,333 @@ namespace cling {
|
||||
Out << typeStr << " " << valueStr << '\n';
|
||||
}
|
||||
|
||||
namespace utf8 {
  // Adapted from utf8++
  enum {
    LEAD_SURROGATE_MIN  = 0xd800u,
    TRAIL_SURROGATE_MIN = 0xdc00u,
    TRAIL_SURROGATE_MAX = 0xdfffu,
    LEAD_OFFSET         = LEAD_SURROGATE_MIN - (0x10000 >> 10),
    CODE_POINT_MAX      = 0x0010ffffu
  };

  // Reinterpret a (possibly signed) char as an unsigned octet.
  static uint8_t mask8(char Octet) {
    return static_cast<uint8_t>(Octet); // & 0xff
  }

  // Check whether the N bytes starting at Str are well-formed UTF-8.
  //
  // Str     - first byte to examine (need not be NUL terminated).
  // N       - number of bytes to examine.
  // Loc     - locale used for the per-byte std::isprint test.
  // isPrint - in/out: stays true only while every sequence's first byte is
  //           printable in Loc (continuation bytes are not tested);
  //           cleared as soon as one is not, and also
  //           cleared whenever the function returns false so that an invalid
  //           string is never reported as plainly printable.
  //
  // Returns false for: stray continuation bytes, truncated sequences,
  // encoded surrogates (U+D800..U+DFFF), overlong encodings (lead bytes
  // 0xC0/0xC1, E0 followed by < 0xA0, F0 followed by < 0x90), code points
  // above U+10FFFF (F4 followed by > 0x8F, lead bytes 0xF5..0xF7) and the
  // obsolete 5/6 byte forms (lead bytes 0xF8..0xFF).
  static bool validate(const char* Str, size_t N,
                       const std::locale& Loc, bool& isPrint) {
    for (size_t i = 0; i < N; ++i) {
      const uint8_t Lead = mask8(Str[i]);
      isPrint = isPrint ? std::isprint(Str[i], Loc) : false;

      if (Lead <= 0x7f)
        continue; // 0bbbbbbb

      // The second byte decides whether a 3 or 4 byte form is overlong, a
      // surrogate or out of range. A missing second byte reads as 0, which
      // is never a valid continuation and is rejected below.
      const uint8_t Second = (i + 1 < N) ? mask8(Str[i + 1]) : 0;

      uint8_t n = 0;  // number of continuation bytes expected
      bool Ok = true;
      if (Lead >= 0xc2 && Lead <= 0xdf) {
        n = 1; // 110bbbbb (0xc0 and 0xc1 would be overlong)
      } else if ((Lead & 0xf0) == 0xe0) {
        n = 2; // 1110bbbb
        Ok = !(Lead == 0xe0 && Second < 0xa0) &&  // overlong
             !(Lead == 0xed && Second >= 0xa0);   // U+d800 to U+dfff
      } else if (Lead >= 0xf0 && Lead <= 0xf4) {
        n = 3; // 11110bbb
        Ok = !(Lead == 0xf0 && Second < 0x90) &&  // overlong
             !(Lead == 0xf4 && Second > 0x8f);    // above CODE_POINT_MAX
      } else {
        Ok = false; // continuation byte as lead, or 0xf5..0xff
      }

      // n bytes matching 10bbbbbb follow ?
      for (uint8_t j = 0; Ok && j < n; ++j) {
        if (++i == N)
          Ok = false; // sequence runs off the end of the buffer
        else
          Ok = (mask8(Str[i]) & 0xc0) == 0x80;
      }

      if (!Ok) {
        isPrint = false;
        return false;
      }
    }
    return true;
  }

  // Length in bytes (1-4) of the sequence introduced by the lead byte at
  // Ptr, or 0 when that byte cannot start a sequence.
  static uint8_t sequenceLength(const char* Ptr) {
    const uint8_t lead = mask8(*Ptr);
    if (lead < 0x80)
      return 1;
    if ((lead >> 5) == 0x6)
      return 2;
    if ((lead >> 4) == 0xe)
      return 3;
    if ((lead >> 3) == 0x1e)
      return 4;
    return 0;
  }

  // Decode the code point at Ptr and advance Ptr past it.
  // No bounds or validity checking is done here: continuation bytes are
  // consumed unconditionally, so the caller must compare Ptr against the end
  // of its buffer afterwards. A byte that is not a valid lead is returned
  // as-is and Ptr advances by one.
  static char32_t next(const char*& Ptr) {
    char32_t CP = mask8(*Ptr);
    switch (sequenceLength(Ptr)) {
      case 1: break;
      case 2:
        ++Ptr;
        CP = ((CP << 6) & 0x7ff) + ((*Ptr) & 0x3f);
        break;
      case 3:
        ++Ptr;
        CP = ((CP << 12) & 0xffff) + ((mask8(*Ptr) << 6) & 0xfff);
        ++Ptr;
        CP += (*Ptr) & 0x3f;
        break;
      case 4:
        ++Ptr;
        CP = ((CP << 18) & 0x1fffff) + ((mask8(*Ptr) << 12) & 0x3ffff);
        ++Ptr;
        CP += (mask8(*Ptr) << 6) & 0xfff;
        ++Ptr;
        CP += (*Ptr) & 0x3f;
        break;
    }
    ++Ptr;
    return CP;
  }

  // mimic isprint() for Unicode codepoints
  // The locale argument is unused; it exists so the signature matches the
  // std::isprint based alternatives.
  static bool isPrint(char32_t CP, const std::locale&) {
    // C0
    if (CP <= 0x1F || CP == 0x7F)
      return false;

    // C1
    if (CP >= 0x80 && CP <= 0x9F)
      return false;

    // line/paragraph separators
    if (CP == 0x2028 || CP == 0x2029)
      return false;

    // bidirectional text control
    if (CP == 0x200E || CP == 0x200F || (CP >= 0x202A && CP <= 0x202E))
      return false;

    // interlinears and generally specials
    if (CP >= 0xFFF9 && CP <= 0xFFFF)
      return false;

    return true;
  }
}
|
||||
|
||||
// Deals with ascii & utf8 characters being output into stdout
|
||||
// As the string is printed, check each character for:
|
||||
// 0. Valid printable character
|
||||
// 1. Unicode code page
|
||||
// 2. Valid format character \t, \n, \r, \f, \v
|
||||
// 3. Unknown; data
|
||||
// Until case 3 is reached, the string is ouput possibly escaped, but
|
||||
// otherwise unadulterated.
|
||||
// If case 3 is reached, back up until the last valid printable character
|
||||
// ( 0 & 1) and dump all remaining 2& 3 characters as hex.
|
||||
|
||||
class RawStringConverter {
|
||||
enum { kBufSize = 1024 };
|
||||
enum HexState { kText, kEsc, kHex, kEnd };
|
||||
|
||||
llvm::SmallString<kBufSize> m_Buf;
|
||||
std::locale m_Loc;
|
||||
bool m_Utf8Out;
|
||||
|
||||
class ByteDumper {
|
||||
RawStringConverter& m_Convert;
|
||||
const char* const m_End;
|
||||
const bool m_Utf8;
|
||||
bool m_HexRun;
|
||||
bool (*isPrintable)(char32_t, const std::locale&);
|
||||
|
||||
template <class T> static bool isPrint(char32_t C, const std::locale& L) {
|
||||
return std::isprint(T(C), L);
|
||||
}
|
||||
|
||||
public:
|
||||
ByteDumper(RawStringConverter& C, const char* E, bool Utf8)
|
||||
: m_Convert(C), m_End(E), m_Utf8(Utf8) {
|
||||
// cache the correct isprint variant rather than checking in a loop
|
||||
isPrintable = m_Convert.m_Utf8Out && m_Utf8 ? &utf8::isPrint :
|
||||
(m_Utf8 ? &isPrint<wchar_t> : &isPrint<char>);
|
||||
}
|
||||
|
||||
HexState operator() (const char*& Ptr, llvm::raw_ostream& Stream,
|
||||
bool ForceHex) {
|
||||
// Block allocate the next chunk
|
||||
if (!(m_Convert.m_Buf.size() % kBufSize))
|
||||
m_Convert.m_Buf.reserve(m_Convert.m_Buf.size() + kBufSize);
|
||||
|
||||
HexState State = kText;
|
||||
const char* const Start = Ptr;
|
||||
char32_t Char;
|
||||
if (m_Utf8) {
|
||||
Char = utf8::next(Ptr);
|
||||
if (Ptr > m_End) {
|
||||
// Invalid/bad encoding: dump the remaining as hex
|
||||
Ptr = Start;
|
||||
while (Ptr < m_End)
|
||||
Stream << "\\x" << llvm::format_hex_no_prefix(uint8_t(*Ptr++), 2);
|
||||
m_HexRun = true;
|
||||
return kHex;
|
||||
}
|
||||
} else
|
||||
Char = (*Ptr++ & 0xff);
|
||||
|
||||
const std::locale& Loc = m_Convert.m_Loc;
|
||||
// Assume more often than not -regular- strings are printed
|
||||
if (LLVM_UNLIKELY(!isPrintable(Char, Loc))) {
|
||||
m_HexRun = false;
|
||||
if (LLVM_UNLIKELY(ForceHex || !std::isspace(wchar_t(Char), Loc))) {
|
||||
if (Char > 0xffff)
|
||||
Stream << "\\U" << llvm::format_hex_no_prefix(uint32_t(Char), 8);
|
||||
else if (Char > 0xff)
|
||||
Stream << "\\u" << llvm::format_hex_no_prefix(uint16_t(Char), 4);
|
||||
else if (Char) {
|
||||
Stream << "\\x" << llvm::format_hex_no_prefix(uint8_t(Char), 2);
|
||||
m_HexRun = true;
|
||||
return kHex;
|
||||
} else
|
||||
Stream << "\\0";
|
||||
return kText;
|
||||
}
|
||||
|
||||
switch (Char) {
|
||||
case '\b': Stream << "\\b"; return kEsc;
|
||||
// \r isn't so great on Unix, what about Windows?
|
||||
case '\r': Stream << "\\r"; return kEsc;
|
||||
default: break;
|
||||
}
|
||||
State = kEsc;
|
||||
}
|
||||
|
||||
if (m_HexRun) {
|
||||
// If the last print was a hex code, and this is now a char that could
|
||||
// be interpreted as a continuation of that hex-sequence, close out
|
||||
// the string and use concatenation. {'\xea', 'B'} -> "\xea" "B"
|
||||
m_HexRun = false;
|
||||
if (std::isxdigit(wchar_t(Char), Loc))
|
||||
Stream << "\" \"";
|
||||
}
|
||||
if (m_Utf8)
|
||||
Stream << llvm::StringRef(Start, Ptr-Start);
|
||||
else
|
||||
Stream << char(Char);
|
||||
return State;
|
||||
}
|
||||
};
|
||||
|
||||
public:
|
||||
RawStringConverter() : m_Utf8Out(false) {
|
||||
if (!::strcasestr(m_Loc.name().c_str(), "utf-8")) {
|
||||
if (const char* LANG = ::getenv("LANG")) {
|
||||
if (::strcasestr(LANG, "utf-8")) {
|
||||
m_Loc = std::locale(LANG);
|
||||
m_Utf8Out = true;
|
||||
}
|
||||
}
|
||||
} else
|
||||
m_Utf8Out = true;
|
||||
}
|
||||
|
||||
llvm::StringRef convert(const char* const Start, size_t N) {
|
||||
const char* Ptr = Start;
|
||||
const char* const End = Start + N;
|
||||
|
||||
bool isUtf8 = Start[0] != '\"';
|
||||
if (!isUtf8) {
|
||||
bool isPrint = true;
|
||||
// A const char* string may not neccessarily be utf8.
|
||||
// When the locale can output utf8 strings, validate it as utf8 first.
|
||||
if (!m_Utf8Out) {
|
||||
while (isPrint && Ptr < End)
|
||||
isPrint = std::isprint(*Ptr++, m_Loc);
|
||||
} else
|
||||
isUtf8 = utf8::validate(Ptr, N, m_Loc, isPrint);
|
||||
|
||||
// Simple printable string, just return it now.
|
||||
if (isPrint)
|
||||
return llvm::StringRef(Start, N);
|
||||
|
||||
Ptr = Start;
|
||||
} else {
|
||||
assert(Start[0] == 'u' || Start[0] == 'U' || Start[0] == 'L'
|
||||
&& "Unkown string encoding");
|
||||
// Unicode string, assume valid
|
||||
isUtf8 = true;
|
||||
}
|
||||
|
||||
ByteDumper Dump(*this, End, isUtf8);
|
||||
{ // scope for llvm::raw_svector_ostream
|
||||
size_t LastGood = 0;
|
||||
HexState Hex = kText;
|
||||
llvm::raw_svector_ostream Strm(m_Buf);
|
||||
while ((Hex < kEnd) && (Ptr < End)) {
|
||||
const size_t LastPos = Ptr - Start;
|
||||
switch (Dump(Ptr, Strm, Hex==kHex)) {
|
||||
case kHex:
|
||||
// Printed hex char
|
||||
// Keep doing it as long as we haven't printed an escaped char
|
||||
Hex = (Hex == kEsc) ? kEnd : kHex;
|
||||
break;
|
||||
case kEsc:
|
||||
assert(Hex <= kEsc && "Escape code after hex shouldn't occur");
|
||||
// Mark this as the last good character printed
|
||||
if (Hex == kText)
|
||||
LastGood = LastPos;
|
||||
Hex = kEsc;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (Hex != kEnd)
|
||||
return Strm.str();
|
||||
|
||||
Ptr = Start + LastGood;
|
||||
m_Buf.resize(LastGood);
|
||||
}
|
||||
|
||||
llvm::raw_svector_ostream Strm(m_Buf);
|
||||
while (Ptr < End)
|
||||
Dump(Ptr, Strm, true);
|
||||
return Strm.str();
|
||||
}
|
||||
};
|
||||
|
||||
void Value::dump() const {
|
||||
print(cling::outs());
|
||||
// We need stream that doesn't close its file descriptor, thus we are not
|
||||
// using llvm::outs. Keeping file descriptor open we will be able to use
|
||||
// the results in pipes (Savannah #99234).
|
||||
llvm::raw_os_ostream Out(std::cout);
|
||||
|
||||
// Get the default type string representation
|
||||
Out << cling::valuePrinterInternal::printTypeInternal(*this);
|
||||
Out << " ";
|
||||
|
||||
// Get the value string representation, by printValue() method overloading
|
||||
const std::string Val = cling::valuePrinterInternal::printValueInternal(*this);
|
||||
|
||||
const char* Data = Val.data();
|
||||
const size_t N = Val.size();
|
||||
switch (N ? Data[0] : 0) {
|
||||
case 'u': case 'U': case 'L':
|
||||
if (N < 3 || Data[1] != '\"')
|
||||
break;
|
||||
// Unicode string, encoded as Utf-8
|
||||
case '\"':
|
||||
if (N > 2 && Data[N-1] == '\"') {
|
||||
// Drop the terminating " so Utf-8 errors can be detected ("\xeA")
|
||||
Out << RawStringConverter().convert(Data, N-1) << "\"\n";
|
||||
return;
|
||||
}
|
||||
default:
|
||||
break;
|
||||
}
|
||||
Out << Val << '\n';
|
||||
}
|
||||
} // end namespace cling
|
||||
|
@ -8,6 +8,20 @@
|
||||
|
||||
//RUN: cat %s | %cling -Xclang -verify 2>&1 | FileCheck %s
|
||||
|
||||
#include <stdlib.h>
#include <string.h>
|
||||
#ifdef _WIN32
|
||||
extern "C" int SetConsoleOutputCP(unsigned int);
|
||||
#endif
|
||||
|
||||
// Select the output encoding for the checks that follow.
// Lang is a locale name such as "en_US.UTF-8", or "" for plain ASCII.
// On Windows the console output code page is switched; elsewhere the LANG
// environment variable is overwritten with Lang.
static void setLang(const char* Lang) {
#ifdef _WIN32
  // 65001 is the UTF-8 code page, 20127 is US-ASCII.
  ::SetConsoleOutputCP(::strcmp(Lang, "en_US.UTF-8") == 0 ? 65001 : 20127);
#else
  ::setenv("LANG", Lang, 1);
#endif
}
|
||||
setLang("en_US.UTF-8");
|
||||
|
||||
const char* Data = (const char*) 0x01
|
||||
// CHECK: (const char *) 0x{{.+}} <invalid memory address>
|
||||
|
||||
@ -32,6 +46,70 @@ cling::printValue(&RawData)[13]
|
||||
// CHECK-NEXT: Line2
|
||||
// CHECK-NEXT: Line3"
|
||||
|
||||
"\x12""\x13"
|
||||
// CHECK: (const char [3]) "\x12\x13"
|
||||
|
||||
"ABCD" "\x10""\x15" "EFG"
|
||||
// CHECK-NEXT: (const char [10]) "ABCD\x10\x15" "EFG"
|
||||
|
||||
"ENDWITH" "\x11""\x07"
|
||||
// CHECK-NEXT: (const char [10]) "ENDWITH\x11\x07"
|
||||
|
||||
"\x03" "\x09" "BEGANWITH"
|
||||
// CHECK-NEXT: (const char [12]) "\x03\x09" "BEGANWITH"
|
||||
|
||||
"1233123213\n\n\n\f234\x3"
|
||||
// CHECK-NEXT: (const char [19]) "1233123213\x0a\x0a\x0a\x0c" "234\x03"
|
||||
|
||||
// Posing as UTF-8, but invalid
|
||||
// https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
|
||||
|
||||
"\xea"
|
||||
// CHECK-NEXT: (const char [2]) "\xea"
|
||||
|
||||
"\xea\xfb"
|
||||
// CHECK-NEXT: (const char [3]) "\xea\xfb"
|
||||
|
||||
"\xfe\xfe\xff\xff"
|
||||
// CHECK-NEXT: (const char [5]) "\xfe\xfe\xff\xff"
|
||||
|
||||
"\xfc\x80\x80\x80\x80\xaf"
|
||||
// CHECK-NEXT: (const char [7]) "\xfc\x80\x80\x80\x80\xaf"
|
||||
|
||||
"\xfc\x83\xbf\xbf\xbf\xbf"
|
||||
// CHECK-NEXT: (const char [7]) "\xfc\x83\xbf\xbf\xbf\xbf"
|
||||
|
||||
"\xed\xa0\x80"
|
||||
// CHECK-NEXT: (const char [4]) "\xed\xa0\x80"
|
||||
"\xed\xad\xbf"
|
||||
// CHECK-NEXT: (const char [4]) "\xed\xad\xbf"
|
||||
"\xed\xae\x80"
|
||||
// CHECK-NEXT: (const char [4]) "\xed\xae\x80"
|
||||
"\xed\xaf\xbf"
|
||||
// CHECK-NEXT: (const char [4]) "\xed\xaf\xbf"
|
||||
"\xed\xb0\x80"
|
||||
// CHECK-NEXT: (const char [4]) "\xed\xb0\x80"
|
||||
"\xed\xbe\x80"
|
||||
// CHECK-NEXT: (const char [4]) "\xed\xbe\x80"
|
||||
"\xed\xbf\xbf"
|
||||
// CHECK-NEXT: (const char [4]) "\xed\xbf\xbf"
|
||||
|
||||
"\xed\xa0\x80\xed\xb0\x80"
|
||||
// CHECK-NEXT: (const char [7]) "\xed\xa0\x80\xed\xb0\x80"
|
||||
"\xed\xa0\x80\xed\xbf\xbf"
|
||||
// CHECK-NEXT: (const char [7]) "\xed\xa0\x80\xed\xbf\xbf"
|
||||
"\xed\xad\xbf\xed\xb0\x80"
|
||||
// CHECK-NEXT: (const char [7]) "\xed\xad\xbf\xed\xb0\x80"
|
||||
"\xed\xad\xbf\xed\xbf\xbf"
|
||||
// CHECK-NEXT: (const char [7]) "\xed\xad\xbf\xed\xbf\xbf"
|
||||
"\xed\xae\x80\xed\xb0\x80"
|
||||
// CHECK-NEXT: (const char [7]) "\xed\xae\x80\xed\xb0\x80"
|
||||
"\xed\xae\x80\xed\xbf\xbf"
|
||||
// CHECK-NEXT: (const char [7]) "\xed\xae\x80\xed\xbf\xbf"
|
||||
"\xed\xaf\xbf\xed\xb0\x80"
|
||||
// CHECK-NEXT: (const char [7]) "\xed\xaf\xbf\xed\xb0\x80"
|
||||
"\xed\xaf\xbf\xed\xbf\xbf"
|
||||
// CHECK-NEXT: (const char [7]) "\xed\xaf\xbf\xed\xbf\xbf"
|
||||
|
||||
std::string(u8"UTF-8")
|
||||
// CHECK-NEXT: (std::string) "UTF-8"
|
||||
@ -69,5 +147,44 @@ const wchar_t* wides = L"wide";
|
||||
wides[3]
|
||||
// CHECK-NEXT: (const wchar_t) L'\x{{0+}}65'
|
||||
|
||||
// ASCII output
|
||||
|
||||
setLang("");
|
||||
|
||||
u"UTF-16 " u"\x394" u"\x3a6" u"\x3a9"
|
||||
// CHECK-NEXT: (const char16_t [11]) u"UTF-16 \u0394\u03a6\u03a9"
|
||||
|
||||
U"UTF-32\x262D\x2615\x265F"
|
||||
// CHECK-NEXT: (const char32_t [10]) U"UTF-32\u262d\u2615\u265f"
|
||||
|
||||
"\u20ac"
|
||||
// CHECk-NEXT: (const char [4]) "\xe2\x82\xac"
|
||||
|
||||
"\u2620\u2603\u2368"
|
||||
// CHECk-NEXT: (const char [10]) "\xe2\x98\xa0\xe2\x98\x83\xe2\x8d\xa8"
|
||||
|
||||
|
||||
#include <stdio.h>
|
||||
#include <vector>
|
||||
|
||||
// Load the raw bytes of "Strings.dat" (looked up in the current directory)
// into FData and append a terminating 0 so the buffer can be used as a C
// string. If the file is missing, empty or unreadable, only the terminator
// is appended.
static void ReadData(std::vector<char>& FData) {
  // Binary mode: the file holds arbitrary bytes that must not be translated.
  if (FILE *File = ::fopen("Strings.dat", "rb")) {
    long Size = -1;
    if (::fseek(File, 0L, SEEK_END) == 0)
      Size = ::ftell(File); // -1 on failure
    if (Size > 0 && ::fseek(File, 0L, SEEK_SET) == 0) {
      const size_t N = static_cast<size_t>(Size);
      FData.reserve(N+1);
      FData.resize(N);
      // Keep only what was actually read.
      FData.resize(::fread(&FData[0], 1, N, File));
    }
    ::fclose(File);
  }
  FData.push_back(0);
}
|
||||
|
||||
std::vector<char> FDat;
|
||||
ReadData(FDat);
|
||||
(char*)FDat.data()
|
||||
// CHECk-NEXT: (char *) "deadbeeffeedfacec0ffeedebac1eecafebabe\xde\xad\xbe\xef\xfe\xed\xfa\xce\xc0\xff\xee\xde\xba\xc1\xee\xca\xfe\xba\xbe"
|
||||
|
||||
// expected-no-diagnostics
|
||||
.q
|
||||
|
1
test/Prompt/ValuePrinter/Strings.dat
Executable file
1
test/Prompt/ValuePrinter/Strings.dat
Executable file
@ -0,0 +1 @@
|
||||
deadbeeffeedfacec0ffeedebac1eecafebabeÞ¾ïþíúÎÀÿîÞºÁîÊþº¾
|
Loading…
x
Reference in New Issue
Block a user