498 lines
13 KiB
C++
498 lines
13 KiB
C++
#include <algorithm>
|
|
#include <iomanip>
|
|
#include <sstream>
|
|
|
|
#include "emitterutils.h"
|
|
#include "exp.h"
|
|
#include "indentation.h"
|
|
#include "regex_yaml.h"
|
|
#include "regeximpl.h"
|
|
#include "stringsource.h"
|
|
#include "yaml-cpp/binary.h" // IWYU pragma: keep
|
|
#include "yaml-cpp/null.h"
|
|
#include "yaml-cpp/ostream_wrapper.h"
|
|
|
|
namespace YAML {
|
|
namespace Utils {
|
|
namespace {
|
|
enum { REPLACEMENT_CHARACTER = 0xFFFD };
|
|
|
|
bool IsAnchorChar(int ch) { // test for ns-anchor-char
|
|
switch (ch) {
|
|
case ',':
|
|
case '[':
|
|
case ']':
|
|
case '{':
|
|
case '}': // c-flow-indicator
|
|
case ' ':
|
|
case '\t': // s-white
|
|
case 0xFEFF: // c-byte-order-mark
|
|
case 0xA:
|
|
case 0xD: // b-char
|
|
return false;
|
|
case 0x85:
|
|
return true;
|
|
}
|
|
|
|
if (ch < 0x20) {
|
|
return false;
|
|
}
|
|
|
|
if (ch < 0x7E) {
|
|
return true;
|
|
}
|
|
|
|
if (ch < 0xA0) {
|
|
return false;
|
|
}
|
|
if (ch >= 0xD800 && ch <= 0xDFFF) {
|
|
return false;
|
|
}
|
|
if ((ch & 0xFFFE) == 0xFFFE) {
|
|
return false;
|
|
}
|
|
if ((ch >= 0xFDD0) && (ch <= 0xFDEF)) {
|
|
return false;
|
|
}
|
|
if (ch > 0x10FFFF) {
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
int Utf8BytesIndicated(char ch) {
|
|
int byteVal = static_cast<unsigned char>(ch);
|
|
switch (byteVal >> 4) {
|
|
case 0:
|
|
case 1:
|
|
case 2:
|
|
case 3:
|
|
case 4:
|
|
case 5:
|
|
case 6:
|
|
case 7:
|
|
return 1;
|
|
case 12:
|
|
case 13:
|
|
return 2;
|
|
case 14:
|
|
return 3;
|
|
case 15:
|
|
return 4;
|
|
default:
|
|
return -1;
|
|
}
|
|
}
|
|
|
|
bool IsTrailingByte(char ch) { return (ch & 0xC0) == 0x80; }
|
|
|
|
bool GetNextCodePointAndAdvance(int& codePoint,
|
|
std::string::const_iterator& first,
|
|
std::string::const_iterator last) {
|
|
if (first == last)
|
|
return false;
|
|
|
|
int nBytes = Utf8BytesIndicated(*first);
|
|
if (nBytes < 1) {
|
|
// Bad lead byte
|
|
++first;
|
|
codePoint = REPLACEMENT_CHARACTER;
|
|
return true;
|
|
}
|
|
|
|
if (nBytes == 1) {
|
|
codePoint = *first++;
|
|
return true;
|
|
}
|
|
|
|
// Gather bits from trailing bytes
|
|
codePoint = static_cast<unsigned char>(*first) & ~(0xFF << (7 - nBytes));
|
|
++first;
|
|
--nBytes;
|
|
for (; nBytes > 0; ++first, --nBytes) {
|
|
if ((first == last) || !IsTrailingByte(*first)) {
|
|
codePoint = REPLACEMENT_CHARACTER;
|
|
break;
|
|
}
|
|
codePoint <<= 6;
|
|
codePoint |= *first & 0x3F;
|
|
}
|
|
|
|
// Check for illegal code points
|
|
if (codePoint > 0x10FFFF)
|
|
codePoint = REPLACEMENT_CHARACTER;
|
|
else if (codePoint >= 0xD800 && codePoint <= 0xDFFF)
|
|
codePoint = REPLACEMENT_CHARACTER;
|
|
else if ((codePoint & 0xFFFE) == 0xFFFE)
|
|
codePoint = REPLACEMENT_CHARACTER;
|
|
else if (codePoint >= 0xFDD0 && codePoint <= 0xFDEF)
|
|
codePoint = REPLACEMENT_CHARACTER;
|
|
return true;
|
|
}
|
|
|
|
void WriteCodePoint(ostream_wrapper& out, int codePoint) {
|
|
if (codePoint < 0 || codePoint > 0x10FFFF) {
|
|
codePoint = REPLACEMENT_CHARACTER;
|
|
}
|
|
if (codePoint <= 0x7F) {
|
|
out << static_cast<char>(codePoint);
|
|
} else if (codePoint <= 0x7FF) {
|
|
out << static_cast<char>(0xC0 | (codePoint >> 6))
|
|
<< static_cast<char>(0x80 | (codePoint & 0x3F));
|
|
} else if (codePoint <= 0xFFFF) {
|
|
out << static_cast<char>(0xE0 | (codePoint >> 12))
|
|
<< static_cast<char>(0x80 | ((codePoint >> 6) & 0x3F))
|
|
<< static_cast<char>(0x80 | (codePoint & 0x3F));
|
|
} else {
|
|
out << static_cast<char>(0xF0 | (codePoint >> 18))
|
|
<< static_cast<char>(0x80 | ((codePoint >> 12) & 0x3F))
|
|
<< static_cast<char>(0x80 | ((codePoint >> 6) & 0x3F))
|
|
<< static_cast<char>(0x80 | (codePoint & 0x3F));
|
|
}
|
|
}
|
|
|
|
bool IsValidPlainScalar(const std::string& str, FlowType::value flowType,
|
|
bool allowOnlyAscii) {
|
|
// check against null
|
|
if (IsNullString(str)) {
|
|
return false;
|
|
}
|
|
|
|
// check the start
|
|
const RegEx& start = (flowType == FlowType::Flow ? Exp::PlainScalarInFlow()
|
|
: Exp::PlainScalar());
|
|
if (!start.Matches(str)) {
|
|
return false;
|
|
}
|
|
|
|
// and check the end for plain whitespace (which can't be faithfully kept in a
|
|
// plain scalar)
|
|
if (!str.empty() && *str.rbegin() == ' ') {
|
|
return false;
|
|
}
|
|
|
|
// then check until something is disallowed
|
|
static const RegEx& disallowed_flow =
|
|
Exp::EndScalarInFlow() | (Exp::BlankOrBreak() + Exp::Comment()) |
|
|
Exp::NotPrintable() | Exp::Utf8_ByteOrderMark() | Exp::Break() |
|
|
Exp::Tab() | Exp::Ampersand();
|
|
static const RegEx& disallowed_block =
|
|
Exp::EndScalar() | (Exp::BlankOrBreak() + Exp::Comment()) |
|
|
Exp::NotPrintable() | Exp::Utf8_ByteOrderMark() | Exp::Break() |
|
|
Exp::Tab() | Exp::Ampersand();
|
|
const RegEx& disallowed =
|
|
flowType == FlowType::Flow ? disallowed_flow : disallowed_block;
|
|
|
|
StringCharSource buffer(str.c_str(), str.size());
|
|
while (buffer) {
|
|
if (disallowed.Matches(buffer)) {
|
|
return false;
|
|
}
|
|
if (allowOnlyAscii && (0x80 <= static_cast<unsigned char>(buffer[0]))) {
|
|
return false;
|
|
}
|
|
++buffer;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
bool IsValidSingleQuotedScalar(const std::string& str, bool escapeNonAscii) {
|
|
// TODO: check for non-printable characters?
|
|
return std::none_of(str.begin(), str.end(), [=](char ch) {
|
|
return (escapeNonAscii && (0x80 <= static_cast<unsigned char>(ch))) ||
|
|
(ch == '\n');
|
|
});
|
|
}
|
|
|
|
bool IsValidLiteralScalar(const std::string& str, FlowType::value flowType,
|
|
bool escapeNonAscii) {
|
|
if (flowType == FlowType::Flow) {
|
|
return false;
|
|
}
|
|
|
|
// TODO: check for non-printable characters?
|
|
return std::none_of(str.begin(), str.end(), [=](char ch) {
|
|
return (escapeNonAscii && (0x80 <= static_cast<unsigned char>(ch)));
|
|
});
|
|
}
|
|
|
|
std::pair<uint16_t, uint16_t> EncodeUTF16SurrogatePair(int codePoint) {
|
|
const uint32_t leadOffset = 0xD800 - (0x10000 >> 10);
|
|
|
|
return {
|
|
leadOffset | (codePoint >> 10),
|
|
0xDC00 | (codePoint & 0x3FF),
|
|
};
|
|
}
|
|
|
|
void WriteDoubleQuoteEscapeSequence(ostream_wrapper& out, int codePoint, StringEscaping::value stringEscapingStyle) {
|
|
static const char hexDigits[] = "0123456789abcdef";
|
|
|
|
out << "\\";
|
|
int digits = 8;
|
|
if (codePoint < 0xFF && stringEscapingStyle != StringEscaping::JSON) {
|
|
out << "x";
|
|
digits = 2;
|
|
} else if (codePoint < 0xFFFF) {
|
|
out << "u";
|
|
digits = 4;
|
|
} else if (stringEscapingStyle != StringEscaping::JSON) {
|
|
out << "U";
|
|
digits = 8;
|
|
} else {
|
|
auto surrogatePair = EncodeUTF16SurrogatePair(codePoint);
|
|
WriteDoubleQuoteEscapeSequence(out, surrogatePair.first, stringEscapingStyle);
|
|
WriteDoubleQuoteEscapeSequence(out, surrogatePair.second, stringEscapingStyle);
|
|
return;
|
|
}
|
|
|
|
// Write digits into the escape sequence
|
|
for (; digits > 0; --digits)
|
|
out << hexDigits[(codePoint >> (4 * (digits - 1))) & 0xF];
|
|
}
|
|
|
|
bool WriteAliasName(ostream_wrapper& out, const std::string& str) {
|
|
int codePoint;
|
|
for (std::string::const_iterator i = str.begin();
|
|
GetNextCodePointAndAdvance(codePoint, i, str.end());) {
|
|
if (!IsAnchorChar(codePoint)) {
|
|
return false;
|
|
}
|
|
|
|
WriteCodePoint(out, codePoint);
|
|
}
|
|
return true;
|
|
}
|
|
} // namespace
|
|
|
|
StringFormat::value ComputeStringFormat(const std::string& str,
|
|
EMITTER_MANIP strFormat,
|
|
FlowType::value flowType,
|
|
bool escapeNonAscii) {
|
|
switch (strFormat) {
|
|
case Auto:
|
|
if (IsValidPlainScalar(str, flowType, escapeNonAscii)) {
|
|
return StringFormat::Plain;
|
|
}
|
|
return StringFormat::DoubleQuoted;
|
|
case SingleQuoted:
|
|
if (IsValidSingleQuotedScalar(str, escapeNonAscii)) {
|
|
return StringFormat::SingleQuoted;
|
|
}
|
|
return StringFormat::DoubleQuoted;
|
|
case DoubleQuoted:
|
|
return StringFormat::DoubleQuoted;
|
|
case Literal:
|
|
if (IsValidLiteralScalar(str, flowType, escapeNonAscii)) {
|
|
return StringFormat::Literal;
|
|
}
|
|
return StringFormat::DoubleQuoted;
|
|
default:
|
|
break;
|
|
}
|
|
|
|
return StringFormat::DoubleQuoted;
|
|
}
|
|
|
|
bool WriteSingleQuotedString(ostream_wrapper& out, const std::string& str) {
|
|
out << "'";
|
|
int codePoint;
|
|
for (std::string::const_iterator i = str.begin();
|
|
GetNextCodePointAndAdvance(codePoint, i, str.end());) {
|
|
if (codePoint == '\n') {
|
|
return false; // We can't handle a new line and the attendant indentation
|
|
// yet
|
|
}
|
|
|
|
if (codePoint == '\'') {
|
|
out << "''";
|
|
} else {
|
|
WriteCodePoint(out, codePoint);
|
|
}
|
|
}
|
|
out << "'";
|
|
return true;
|
|
}
|
|
|
|
bool WriteDoubleQuotedString(ostream_wrapper& out, const std::string& str,
|
|
StringEscaping::value stringEscaping) {
|
|
out << "\"";
|
|
int codePoint;
|
|
for (std::string::const_iterator i = str.begin();
|
|
GetNextCodePointAndAdvance(codePoint, i, str.end());) {
|
|
switch (codePoint) {
|
|
case '\"':
|
|
out << "\\\"";
|
|
break;
|
|
case '\\':
|
|
out << "\\\\";
|
|
break;
|
|
case '\n':
|
|
out << "\\n";
|
|
break;
|
|
case '\t':
|
|
out << "\\t";
|
|
break;
|
|
case '\r':
|
|
out << "\\r";
|
|
break;
|
|
case '\b':
|
|
out << "\\b";
|
|
break;
|
|
case '\f':
|
|
out << "\\f";
|
|
break;
|
|
default:
|
|
if (codePoint < 0x20 ||
|
|
(codePoint >= 0x80 &&
|
|
codePoint <= 0xA0)) { // Control characters and non-breaking space
|
|
WriteDoubleQuoteEscapeSequence(out, codePoint, stringEscaping);
|
|
} else if (codePoint == 0xFEFF) { // Byte order marks (ZWNS) should be
|
|
// escaped (YAML 1.2, sec. 5.2)
|
|
WriteDoubleQuoteEscapeSequence(out, codePoint, stringEscaping);
|
|
} else if (stringEscaping == StringEscaping::NonAscii && codePoint > 0x7E) {
|
|
WriteDoubleQuoteEscapeSequence(out, codePoint, stringEscaping);
|
|
} else {
|
|
WriteCodePoint(out, codePoint);
|
|
}
|
|
}
|
|
}
|
|
out << "\"";
|
|
return true;
|
|
}
|
|
|
|
bool WriteLiteralString(ostream_wrapper& out, const std::string& str,
|
|
std::size_t indent) {
|
|
out << "|\n";
|
|
int codePoint;
|
|
for (std::string::const_iterator i = str.begin();
|
|
GetNextCodePointAndAdvance(codePoint, i, str.end());) {
|
|
if (codePoint == '\n') {
|
|
out << "\n";
|
|
} else {
|
|
out<< IndentTo(indent);
|
|
WriteCodePoint(out, codePoint);
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
bool WriteChar(ostream_wrapper& out, char ch, StringEscaping::value stringEscapingStyle) {
|
|
if (('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z')) {
|
|
out << ch;
|
|
} else if (ch == '\"') {
|
|
out << R"("\"")";
|
|
} else if (ch == '\t') {
|
|
out << R"("\t")";
|
|
} else if (ch == '\n') {
|
|
out << R"("\n")";
|
|
} else if (ch == '\b') {
|
|
out << R"("\b")";
|
|
} else if (ch == '\r') {
|
|
out << R"("\r")";
|
|
} else if (ch == '\f') {
|
|
out << R"("\f")";
|
|
} else if (ch == '\\') {
|
|
out << R"("\\")";
|
|
} else if (0x20 <= ch && ch <= 0x7e) {
|
|
out << "\"" << ch << "\"";
|
|
} else {
|
|
out << "\"";
|
|
WriteDoubleQuoteEscapeSequence(out, ch, stringEscapingStyle);
|
|
out << "\"";
|
|
}
|
|
return true;
|
|
}
|
|
|
|
bool WriteComment(ostream_wrapper& out, const std::string& str,
|
|
std::size_t postCommentIndent) {
|
|
const std::size_t curIndent = out.col();
|
|
out << "#" << Indentation(postCommentIndent);
|
|
out.set_comment();
|
|
int codePoint;
|
|
for (std::string::const_iterator i = str.begin();
|
|
GetNextCodePointAndAdvance(codePoint, i, str.end());) {
|
|
if (codePoint == '\n') {
|
|
out << "\n"
|
|
<< IndentTo(curIndent) << "#" << Indentation(postCommentIndent);
|
|
out.set_comment();
|
|
} else {
|
|
WriteCodePoint(out, codePoint);
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
bool WriteAlias(ostream_wrapper& out, const std::string& str) {
|
|
out << "*";
|
|
return WriteAliasName(out, str);
|
|
}
|
|
|
|
bool WriteAnchor(ostream_wrapper& out, const std::string& str) {
|
|
out << "&";
|
|
return WriteAliasName(out, str);
|
|
}
|
|
|
|
bool WriteTag(ostream_wrapper& out, const std::string& str, bool verbatim) {
|
|
out << (verbatim ? "!<" : "!");
|
|
StringCharSource buffer(str.c_str(), str.size());
|
|
const RegEx& reValid = verbatim ? Exp::URI() : Exp::Tag();
|
|
while (buffer) {
|
|
int n = reValid.Match(buffer);
|
|
if (n <= 0) {
|
|
return false;
|
|
}
|
|
|
|
while (--n >= 0) {
|
|
out << buffer[0];
|
|
++buffer;
|
|
}
|
|
}
|
|
if (verbatim) {
|
|
out << ">";
|
|
}
|
|
return true;
|
|
}
|
|
|
|
bool WriteTagWithPrefix(ostream_wrapper& out, const std::string& prefix,
|
|
const std::string& tag) {
|
|
out << "!";
|
|
StringCharSource prefixBuffer(prefix.c_str(), prefix.size());
|
|
while (prefixBuffer) {
|
|
int n = Exp::URI().Match(prefixBuffer);
|
|
if (n <= 0) {
|
|
return false;
|
|
}
|
|
|
|
while (--n >= 0) {
|
|
out << prefixBuffer[0];
|
|
++prefixBuffer;
|
|
}
|
|
}
|
|
|
|
out << "!";
|
|
StringCharSource tagBuffer(tag.c_str(), tag.size());
|
|
while (tagBuffer) {
|
|
int n = Exp::Tag().Match(tagBuffer);
|
|
if (n <= 0) {
|
|
return false;
|
|
}
|
|
|
|
while (--n >= 0) {
|
|
out << tagBuffer[0];
|
|
++tagBuffer;
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
bool WriteBinary(ostream_wrapper& out, const Binary& binary) {
|
|
WriteDoubleQuotedString(out, EncodeBase64(binary.data(), binary.size()),
|
|
StringEscaping::None);
|
|
return true;
|
|
}
|
|
} // namespace Utils
|
|
} // namespace YAML
|