Files
espurna/code/espurna/terminal_parsing.cpp
Maxim Prokhorov 7595d3357c terminal: revert UnescapedText handling
Totally breaks on unicode input.

Main reason to add this in the first place was handling of
terminal escape sequences, but perhaps it would be better off to do
that on another layer.

For example, <Backspace> will inject an invisible symbol into the resulting
'chunk', at least one other thing to do is to validate after we append
things into an 'argv' that it at least looks like some kind of text.
2022-09-08 15:56:52 +03:00

506 lines
12 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/*
Part of the TERMINAL MODULE
Copyright (C) 2016-2019 by Xose Pérez <xose dot perez at gmail dot com>
Copyright (C) 2020 by Maxim Prokhorov <prokhorov dot max at outlook dot com>
*/
#include <vector>
#include <cctype>
#include "terminal_parsing.h"
namespace espurna {
namespace terminal {
namespace parser {
// Convert parser error code into its textual name.
// Values not handled below produce an empty string.
String error(Error value) {
    const char* name = nullptr;

    switch (value) {
    case Error::Ok:
        name = PSTR("Ok");
        break;
    case Error::Uninitialized:
        name = PSTR("Uninitialized");
        break;
    case Error::Busy:
        name = PSTR("Busy");
        break;
    case Error::UnterminatedQuote:
        name = PSTR("UnterminatedQuote");
        break;
    case Error::InvalidEscape:
        name = PSTR("InvalidEscape");
        break;
    case Error::UnexpectedLineEnd:
        name = PSTR("UnexpectedLineEnd");
        break;
    case Error::NoSpaceAfterQuote:
        name = PSTR("NoSpaceAfterQuote");
        break;
    }

    String out;
    if (name != nullptr) {
        out = name;
    }

    return out;
}
namespace {
// Original code is part of the SDSLib 2.0 -- A C dynamic strings library
// - https://github.com/antirez/sds/blob/master/sds.c
// - https://github.com/antirez/redis/blob/unstable/src/networking.c
// Replaced with a stateful parser to avoid random look-ahead issues in the code,
// and really make sure we **never** go out of bounds of the given view.
// (e.g. when we want to parse only a part of a larger string)
// Helper functions to handle \xHH codes that could encode
// non-printable characters for commands or arguments
// True when `c` is one of 0-9, a-f or A-F
bool is_hex_digit(char c) {
    const bool decimal = (c >= '0') && (c <= '9');
    const bool lower = (c >= 'a') && (c <= 'f');
    const bool upper = (c >= 'A') && (c <= 'F');
    return decimal || lower || upper;
}
// Numeric value (0...15) of a single hexadecimal digit.
// Anything that is not a hex digit is converted to 0.
char hex_digit_to_byte(char c) {
    if ((c >= '0') && (c <= '9')) {
        return c - '0';
    }

    if ((c >= 'a') && (c <= 'f')) {
        return (c - 'a') + 10;
    }

    if ((c >= 'A') && (c <= 'F')) {
        return (c - 'A') + 10;
    }

    return 0;
}
// Combine two hexadecimal digits into a single byte,
// `lhs` is the upper nibble and `rhs` is the lower one
char hex_digit_to_value(char lhs, char rhs) {
    const auto upper = hex_digit_to_byte(lhs) << 4;
    const auto lower = hex_digit_to_byte(rhs);
    return upper | lower;
}
// Translate the character following a backslash into the control character
// it stands for. Only \n, \r, \t, \b and \a are treated as 'special',
// every other character is returned as-is.
char unescape_char(char c) {
    if (c == 'n') {
        return '\n';
    }

    if (c == 'r') {
        return '\r';
    }

    if (c == 't') {
        return '\t';
    }

    if (c == 'b') {
        return '\b';
    }

    if (c == 'a') {
        return '\a';
    }

    return c;
}
struct Result {
Result() = default;
Result& operator=(Error error) {
_error = error;
_argv.clear();
return *this;
}
Result& operator=(Argv&& argv) {
_argv = std::move(argv);
_error = Error::Ok;
return *this;
}
explicit operator bool() const {
return _error == Error::Ok;
}
Error error() const {
return _error;
}
CommandLine get() {
auto out = CommandLine{
.argv = std::move(_argv),
.error = _error };
_error = Error::Uninitialized;
return out;
}
private:
Error _error { Error::Uninitialized };
Argv _argv;
};
// Stateful tokenizer that splits a single input line into an ARGV list.
// `operator()` walks the given view exactly once, character by character,
// and never looks ahead; everything it needs to remember between characters
// is kept in `State` and `Values`.
struct Parser {
    Parser() = default;
    Result operator()(StringView);

private:
    // only tracked within our `operator()(<LINE>)`
    enum class State {
        Done,
        Initial,
        Text,
        CarriageReturn,
        CarriageReturnAfterText,
        SkipUntilNewLine,
        EscapedText,
        EscapedByteLhs,
        EscapedByteRhs,
        SingleQuote,
        EscapedQuote,
        DoubleQuote,
        AfterQuote,
    };

    // our storage for
    // - ARGV resulting list
    // - text buffer or (interim) text span / range
    // - escaped character (since we don't look ahead when iterating)
    struct Values {
        // [begin, end) range of input characters that are copied verbatim,
        // not yet appended to the `chunk`
        struct Span {
            const char* begin { nullptr };
            const char* end { nullptr };
        };

        Span span;
        String chunk;
        char byte_lhs { 0 };
        Argv argv;

        // Extend pending span with the character at `ptr`.
        // Span can only describe a contiguous range. When `ptr` does not
        // directly follow the current end (i.e. caller skipped something,
        // like a quote character), flush what we have so far and start over.
        // Otherwise, skipped characters would end up in the resulting text.
        void append_span(const char* ptr) {
            if (span.begin && (span.end != ptr)) {
                push_span();
            }

            if (!span.begin) {
                span.begin = ptr;
            }

            span.end = std::next(ptr);
        }

        // Copy pending span (if any) into the `chunk` and reset it
        void push_span() {
            if (span.begin && span.end) {
                StringView view(span.begin, span.end);
                chunk.concat(view.c_str(), view.length());
                span = Values::Span{};
            }
        }

        // Append a single (usually, unescaped) character. Pending span
        // is always flushed first, so the text order is preserved.
        void append_chunk(char c) {
            push_span();
            chunk.concat(&c, 1);
        }

        // Remember the first digit of the \xHH sequence
        void append_byte_lhs(char c) {
            byte_lhs = c;
        }

        // Second digit of the \xHH sequence, append the resulting byte
        void append_byte_rhs(char c) {
            append_chunk(hex_digit_to_value(byte_lhs, c));
        }

        // Finish the current argument and store it in the ARGV list
        void push_chunk() {
            push_span();
            argv.push_back(chunk);
            chunk = "";
        }
    };

    // set for the duration of `operator()`, see `ReentryLock` usage there
    bool _parsing { false };
};
// Split the given LINE into a list of arguments.
//
// - arguments are separated by spaces or tabs
// - line must be terminated with either \n or \r\n. parsing stops at the
//   first line ending, remaining input is ignored. when input ends before
//   line ending is seen, result is an error
// - 'single quoted' text is taken verbatim, only \' is allowed as an escape
// - "double quoted" text supports \n, \r, \t, \b, \a and \xHH escapes, any
//   other escaped character is inserted as-is
// - closing quote must be followed by whitespace or line ending
//
// Returns `Result` with either ARGV (error is `Ok`) or the specific error.
// `Error::Busy` is returned when the parser is already running.
Result Parser::operator()(StringView line) {
    Result result;
    Values values;
    State state { State::Initial };

    ReentryLock lock(_parsing);
    if (!lock.initialized()) {
        result = Error::Busy;
        goto out;
    }

    for (auto it = line.begin(); it != line.end(); ++it) {
        switch (state) {
        // skip leading whitespace until something text-like appears
        case State::Initial:
            switch (*it) {
            case ' ':
            case '\t':
                break;
            case '\r':
                state = State::CarriageReturn;
                break;
            case '\n':
                state = State::Done;
                break;
            default:
                // current character already belongs to the argument
                state = State::Text;
                goto text;
            }
            break;

        // line ending was already seen, ignore everything after it
        case State::Done:
            goto out;

        case State::Text:
text:
            switch (*it) {
            case ' ':
            case '\t':
                values.push_chunk();
                state = State::Initial;
                break;
            // quote character itself is not a part of the argument. flush
            // pending text now, so the span does not continue over the quote
            case '"':
                values.push_span();
                state = State::DoubleQuote;
                break;
            case '\'':
                values.push_span();
                state = State::SingleQuote;
                break;
            case '\r':
                state = State::CarriageReturnAfterText;
                break;
            case '\n':
                values.push_chunk();
                state = State::Done;
                break;
            default:
                values.append_span(it);
                break;
            }
            break;

        // \r is only valid as a part of \r\n
        case State::CarriageReturn:
            if ((*it) == '\n') {
                state = State::Done;
            } else {
                result = Error::UnexpectedLineEnd;
                goto out;
            }
            break;

        // same as above, but there is an argument still waiting to be stored
        case State::CarriageReturnAfterText:
            if ((*it) == '\n') {
                values.push_chunk();
                state = State::Done;
            } else {
                result = Error::UnexpectedLineEnd;
                goto out;
            }
            break;

        case State::SkipUntilNewLine:
            switch (*it) {
            case '\r':
                state = State::CarriageReturn;
                break;
            case '\n':
                state = State::Initial;
                break;
            }
            break;

        // character right after the backslash inside of double quotes
        case State::EscapedText: {
            switch (*it) {
            case '\r':
            case '\n':
                result = Error::UnexpectedLineEnd;
                goto out;
            case 'x':
                state = State::EscapedByteLhs;
                break;
            default:
                values.append_chunk(unescape_char(*it));
                // escape sequence is exactly one character long, continue
                // with the quoted text. otherwise, everything that follows
                // is also treated as escaped and quote is never closed
                state = State::DoubleQuote;
                break;
            }
            break;
        }

        // \xHH, first digit
        case State::EscapedByteLhs:
            if (is_hex_digit(*it)) {
                values.append_byte_lhs(*it);
                state = State::EscapedByteRhs;
            } else {
                result = Error::InvalidEscape;
                goto out;
            }
            break;

        // \xHH, second digit
        case State::EscapedByteRhs:
            if (is_hex_digit(*it)) {
                values.append_byte_rhs(*it);
                state = State::DoubleQuote;
            } else {
                result = Error::InvalidEscape;
                goto out;
            }
            break;

        case State::SingleQuote:
            switch (*it) {
            case '\r':
            case '\n':
                result = Error::UnterminatedQuote;
                goto out;
            case '\\':
                state = State::EscapedQuote;
                break;
            case '\'':
                state = State::AfterQuote;
                break;
            default:
                values.append_span(it);
                break;
            }
            break;

        // character right after the backslash inside of single quotes
        case State::EscapedQuote:
            switch (*it) {
            case '\'':
                // flushes pending span before appending the quote, so
                // the text before the escape stays in front of it and
                // the backslash is not included in the result
                values.append_chunk(*it);
                state = State::SingleQuote;
                break;
            default:
                result = Error::InvalidEscape;
                goto out;
            }
            break;

        // right after the closing quote
        case State::AfterQuote:
            switch (*it) {
            case '\r':
                state = State::CarriageReturnAfterText;
                break;
            case ' ':
            case '\t':
                values.push_chunk();
                state = State::Initial;
                break;
            case '\n':
                values.push_chunk();
                state = State::Done;
                break;
            default:
                result = Error::NoSpaceAfterQuote;
                goto out;
            }
            break;

        case State::DoubleQuote:
            switch (*it) {
            case '\r':
            case '\n':
                result = Error::UnterminatedQuote;
                goto out;
            case '"':
                state = State::AfterQuote;
                break;
            case '\\':
                state = State::EscapedText;
                break;
            default:
                values.append_span(it);
                break;
            }
            break;
        }
    }

out:
    if (state == State::Done) {
        result = std::move(values.argv);
    }

    // whenever line ends before we are done parsing, make sure
    // result contains a valid error condition (same as in the switch above)
    if (result.error() == Error::Uninitialized) {
        switch (state) {
        case State::Done:
            break;
        case State::CarriageReturn:
        case State::CarriageReturnAfterText:
        case State::Text:
        case State::Initial:
        case State::SkipUntilNewLine:
            result = Error::UnexpectedLineEnd;
            break;
        case State::EscapedByteLhs:
        case State::EscapedByteRhs:
        case State::EscapedText:
        case State::EscapedQuote:
            result = Error::InvalidEscape;
            break;
        case State::SingleQuote:
        case State::DoubleQuote:
            result = Error::UnterminatedQuote;
            break;
        case State::AfterQuote:
            result = Error::NoSpaceAfterQuote;
            break;
        }
    }

    return result;
}
// Run the given line through the parser instance shared between all calls
// and return both the resulting arguments and the error code
CommandLine parse_line(StringView line) {
    static Parser instance;
    auto result = instance(line);
    return result.get();
}
} // namespace
// Fowler-Noll-Vo hash function to hash command strings, treating input as lowercase
// ref: https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
// (each byte is xor'ed into the hash first and then the hash is multiplied by the prime)
//
// This is here in case `std::unordered_map` becomes viable
// TODO: afaik, map implementation should handle collisions (however rare they are in our case)
// if not, we can always roll static commands allocation and just match strings with strcmp_P
uint32_t lowercase_fnv1_hash(StringView value) {
    constexpr uint32_t fnv_basis = 2166136261u;
    constexpr uint32_t fnv_prime = 16777619u;

    uint32_t hash = fnv_basis;

    const auto end = value.end();
    for (auto it = value.begin(); it != end; ++it) {
        // bytes are read through pgm_read_byte()
        const auto lowered = tolower(pgm_read_byte(it));
        hash ^= static_cast<uint32_t>(lowered);
        hash *= fnv_prime;
    }

    return hash;
}
} // namespace parser
// Entry point declared directly in the `terminal` namespace.
// Forwards the given line to parser::parse_line() and returns its result unchanged.
CommandLine parse_line(StringView value) {
return parser::parse_line(value);
}
} // namespace terminal
} // namespace espurna