/*
 * Reconstructed from a collapsed git patch.
 *
 * Commit:  1c676f954829c229b67e24e8ece10337d0dd94d9
 * From:    Riyyi
 * Date:    Tue, 7 Jun 2022 12:38:12 +0200
 * Subject: [PATCH] Util: Add JSON lexical analyzer
 *
 * The patch creates two files, src/util/json/lexer.h and
 * src/util/json/lexer.cpp. Both are listed below, declarations first so the
 * listing reads (and compiles) top to bottom.
 */

// ============================================================================
// src/util/json/lexer.h
// ============================================================================

/*
 * Copyright (C) 2022 Riyyi
 *
 * SPDX-License-Identifier: MIT
 */

#ifndef JSON_LEXER_H
#define JSON_LEXER_H

// The JavaScript Object Notation (JSON) Data Interchange Format
// https://www.rfc-editor.org/rfc/pdfrfc/rfc8259.txt.pdf

#include <cstddef> // size_t
#include <string>
#include <vector>

namespace Json {

// A single lexical unit of a JSON document together with its position.
struct Token {
    enum class Type {
        None,
        BraceOpen,    // {
        BraceClose,   // }
        BracketOpen,  // [
        BracketClose, // ]
        Colon,        // :
        Comma,        // ,
        String,       // "foobar"
        Number,       // 123.456
        Literal,      // false/null/true (case sensitive)
    };

    Type type { Type::None };
    size_t line { 0 };   // 1-based line of the token's first character
    size_t column { 0 }; // 1-based column of the token's first character
    std::string symbol;  // Lexeme for String/Number/Literal, empty otherwise
};

// Lexical analyzer
//
// Splits the input text into Tokens. Lexing stops at the first invalid
// construct; tokens produced before that point stay available via tokens().
class Lexer {
public:
    Lexer(const std::string& input);
    virtual ~Lexer();

    // Tokenize the input, appending to tokens(). Progress and the
    // "Invalid JSON!" diagnostic are written to stdout.
    void analyze();

    const std::vector<Token>& tokens() const { return m_tokens; }

private:
    // True once every input character has been consumed.
    bool atEnd() const;

    // Current / next character, or '\0' when that position is past the input.
    char peek();
    char peekNext();

    // Return the current character and step over it (no-op at end of input).
    char consume();
    // Step over the current character only if it equals `character`.
    bool consumeSpecific(char character);

    // Consume characters up to (not including) the next delimiter or the end
    // of the input and return them.
    std::string scanLexeme();

    // Log a token to stdout and append it to m_tokens, using the current line.
    // `text` is what gets printed, `symbol` is what gets stored.
    void pushToken(Token::Type type, const char* name, const std::string& text,
                   size_t column, const std::string& symbol = "");

    // Each of these lexes one token starting at the current character. On
    // success the index is left on the token's LAST character, because
    // analyze() advances by one after every iteration. On failure the
    // position is restored and false is returned.
    bool getString();
    bool getNumber();
    bool getLiteral();

    std::string m_input;
    size_t m_index { 0 };
    size_t m_column { 1 };
    size_t m_line { 1 };

    std::vector<Token> m_tokens;
};

} // namespace Json

#endif // JSON_LEXER_H

// ============================================================================
// src/util/json/lexer.cpp
// ============================================================================

/*
 * Copyright (C) 2022 Riyyi
 *
 * SPDX-License-Identifier: MIT
 */

#include <cstddef> // size_t
#include <cstdio>  // printf
#include <string>

// In the split layout this file includes "util/json/lexer.h"; in this
// combined listing the declarations are directly above.

namespace Json {

namespace {

// Characters that terminate a number or literal lexeme.
bool isDelimiter(char character)
{
    switch (character) {
    case ' ':
    case '\t':
    case '\r':
    case '\n':
    case ',':
    case ':':
    case '{':
    case '}':
    case '[':
    case ']':
    case '"':
        return true;
    default:
        return false;
    }
}

bool isDigit(char character)
{
    return character >= '0' && character <= '9';
}

// Check `text` against the RFC 8259 section 6 grammar:
//   number = [ "-" ] int [ frac ] [ exp ]
//   int    = "0" / ( digit1-9 *DIGIT )
//   frac   = "." 1*DIGIT
//   exp    = ( "e" / "E" ) [ "-" / "+" ] 1*DIGIT
bool isValidNumber(const std::string& text)
{
    const size_t length = text.length();
    size_t i = 0;

    if (i < length && text[i] == '-') {
        i++;
    }

    // int: a lone zero, or a non-zero digit followed by any digits
    if (i >= length || !isDigit(text[i])) {
        return false;
    }
    if (text[i] == '0') {
        i++;
    }
    else {
        while (i < length && isDigit(text[i])) {
            i++;
        }
    }

    // frac
    if (i < length && text[i] == '.') {
        i++;
        if (i >= length || !isDigit(text[i])) {
            return false;
        }
        while (i < length && isDigit(text[i])) {
            i++;
        }
    }

    // exp
    if (i < length && (text[i] == 'e' || text[i] == 'E')) {
        i++;
        if (i < length && (text[i] == '+' || text[i] == '-')) {
            i++;
        }
        if (i >= length || !isDigit(text[i])) {
            return false;
        }
        while (i < length && isDigit(text[i])) {
            i++;
        }
    }

    // Anything left over (e.g. the "-2" in "1-2") makes the lexeme invalid
    return i == length;
}

} // namespace

Lexer::Lexer(const std::string& input)
    : m_input(input)
{
}

Lexer::~Lexer() = default;

// -----------------------------------------

void Lexer::analyze()
{
    std::printf("---------\n");
    std::printf("Input JSON:\n%s\n", m_input.c_str());
    std::printf("---------\n");
    std::printf("Lexing:\n");

    while (!atEnd()) {
        bool valid = true;

        switch (peek()) {
        case '{':
            pushToken(Token::Type::BraceOpen, "BraceOpen", "{", m_column);
            break;
        case '}':
            pushToken(Token::Type::BraceClose, "BraceClose", "}", m_column);
            break;
        case '[':
            pushToken(Token::Type::BracketOpen, "BracketOpen", "[", m_column);
            break;
        case ']':
            pushToken(Token::Type::BracketClose, "BracketClose", "]", m_column);
            break;
        case ':':
            pushToken(Token::Type::Colon, "Colon", ":", m_column);
            break;
        case ',':
            pushToken(Token::Type::Comma, "Comma", ",", m_column);
            break;
        case '"':
            valid = getString();
            break;
        case '-':
        case '0':
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9':
            valid = getNumber();
            break;
        case 'f':
        case 'n':
        case 't':
            valid = getLiteral();
            break;
        case ' ':
        case '\t':
            break;
        case '\r':
            if (peekNext() == '\n') { // CRLF \r\n, the '\n' does the line bump
                break;
            }
            m_column = 0;
            m_line++;
            break;
        case '\n':
            // Column becomes 1 again through the increment below
            m_column = 0;
            m_line++;
            break;
        default:
            valid = false;
            break;
        }

        if (!valid) {
            // Error!
            std::printf("Invalid JSON!\n");
            return;
        }

        m_index++;
        m_column++;
    }
}

// -----------------------------------------

bool Lexer::atEnd() const
{
    return m_index >= m_input.length();
}

char Lexer::peek()
{
    return atEnd() ? '\0' : m_input[m_index];
}

char Lexer::peekNext()
{
    return (m_index + 1 < m_input.length()) ? m_input[m_index + 1] : '\0';
}

char Lexer::consume()
{
    char character = peek();
    if (!atEnd()) {
        m_index++;
        m_column++;
    }
    return character;
}

bool Lexer::consumeSpecific(char character)
{
    if (atEnd() || peek() != character) {
        return false;
    }

    m_index++;
    m_column++;
    return true;
}

std::string Lexer::scanLexeme()
{
    std::string lexeme;
    while (!atEnd() && !isDelimiter(peek())) {
        lexeme += consume();
    }
    return lexeme;
}

void Lexer::pushToken(Token::Type type, const char* name, const std::string& text,
                      size_t column, const std::string& symbol)
{
    std::printf("Pushing -> %s: \"%s\"\t%zu[%zu]\n", name, text.c_str(), m_line, column);
    m_tokens.push_back({ type, m_line, column, symbol });
}

bool Lexer::getString()
{
    const size_t index = m_index;
    const size_t column = m_column;
    std::string symbol;

    consume(); // Opening quote

    for (;;) {
        char character = peek();

        // Closing quote; left for analyze() to step over
        if (!atEnd() && character == '"') {
            break;
        }

        // Escape sequences are stored verbatim (backslash included), so an
        // escaped quote does not terminate the string. Valid escapes are:
        // ", \, /, b(ackspace), f(orm feed), n (line feed),
        // r (carriage return), t(ab), u(nicode) \u0021
        if (!atEnd() && character == '\\') {
            symbol += consume();
            character = peek();
        }

        // Unterminated string, or a control character that RFC 8259
        // section 7 requires to be escaped
        if (atEnd() || static_cast<unsigned char>(character) < 0x20) {
            m_index = index;
            m_column = column;
            return false;
        }

        symbol += consume();
    }

    pushToken(Token::Type::String, "String", symbol, column, symbol);

    return true;
}

bool Lexer::getNumber()
{
    const size_t index = m_index;
    const size_t column = m_column;

    // Take everything up to the next delimiter, so malformed input such as
    // "1-2" or "12abc" is rejected as a whole instead of being split up
    const std::string symbol = scanLexeme();

    if (!isValidNumber(symbol)) {
        m_index = index;
        m_column = column;
        return false;
    }

    // Step back onto the last character of the number
    m_index--;
    m_column--;

    pushToken(Token::Type::Number, "Number", symbol, column, symbol);

    return true;
}

bool Lexer::getLiteral()
{
    const size_t index = m_index;
    const size_t column = m_column;

    const std::string symbol = scanLexeme();

    // Literal name validation (case sensitive)
    if (symbol != "false" && symbol != "null" && symbol != "true") {
        m_index = index;
        m_column = column;
        return false;
    }

    // Step back onto the last character of the literal
    m_index--;
    m_column--;

    pushToken(Token::Type::Literal, "Literal", symbol, column, symbol);

    return true;
}

} // namespace Json