Util: Add JSON lexical analyzer

3 years ago · 1c676f9548
2 changed files with 324 additions and 0 deletions
--- a/src/util/json/lexer.cpp
+++ b/src/util/json/lexer.cpp
@ -0,0 +1,254 @@
+/*
+ * Copyright (C) 2022 Riyyi
+ *
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <cstddef>
+#include <cstdio>
+#include <string>
+
+#include "util/json/lexer.h"
+
+namespace Json {
+
+// Creates a lexer that works on its own copy of the given JSON text
+Lexer::Lexer(const std::string& input)
+	: m_input { input }
+{
+}
+
+// Nothing to release by hand, the members clean up after themselves
+Lexer::~Lexer() = default;
+
+// -----------------------------------------
+
+// Walks the input from the current cursor position and appends one token per
+// lexeme to m_tokens, printing a trace of every pushed token to stdout.
+// Lexing stops at the first character or lexeme that is not recognized, after
+// printing "Invalid JSON!"; tokens pushed up to that point are kept.
+void Lexer::analyze()
+{
+	printf("---------\n");
+	printf("Input JSON:\n%s\n", m_input.c_str());
+	printf("---------\n");
+	printf("Lexing:\n");
+
+	// Structural tokens carry no symbol text, only their position
+	auto pushBare = [this](Token::Type type) {
+		m_tokens.push_back({ type, m_line, m_column, "" });
+	};
+
+	// Every lexeme handler leaves the cursor on the last character it used,
+	// the loop increment then steps past it
+	for (; m_index < m_input.length(); m_index++, m_column++) {
+		bool valid = true;
+
+		switch (peek()) {
+		case '{':
+			printf("Pushing ->    BraceOpen:  \"{\"\t%zu[%zu]\n", m_line, m_column);
+			pushBare(Token::Type::BraceOpen);
+			break;
+		case '}':
+			printf("Pushing ->   BraceClose:  \"}\"\t%zu[%zu]\n", m_line, m_column);
+			pushBare(Token::Type::BraceClose);
+			break;
+		case '[':
+			printf("Pushing ->  BracketOpen:  \"[\"\t%zu[%zu]\n", m_line, m_column);
+			pushBare(Token::Type::BracketOpen);
+			break;
+		case ']':
+			printf("Pushing -> BracketClose:  \"]\"\t%zu[%zu]\n", m_line, m_column);
+			pushBare(Token::Type::BracketClose);
+			break;
+		case ':':
+			printf("Pushing ->        Colon:  \":\"\t%zu[%zu]\n", m_line, m_column);
+			pushBare(Token::Type::Colon);
+			break;
+		case ',':
+			printf("Pushing ->        Comma:  \",\"\t%zu[%zu]\n", m_line, m_column);
+			pushBare(Token::Type::Comma);
+			break;
+		case '"':
+			valid = getString();
+			break;
+		case '-':
+		case '0':
+		case '1':
+		case '2':
+		case '3':
+		case '4':
+		case '5':
+		case '6':
+		case '7':
+		case '8':
+		case '9':
+			valid = getNumber();
+			break;
+		case 'f':
+		case 'n':
+		case 't':
+			valid = getLiteral();
+			break;
+		case ' ':
+		case '\t':
+			// Insignificant whitespace
+			break;
+		case '\r':
+			// In a CRLF pair the upcoming '\n' does the line bookkeeping,
+			// only a lone '\r' counts as a line break by itself
+			if (peekNext() != '\n') {
+				m_column = 0;
+				m_line++;
+			}
+			break;
+		case '\n':
+			// Column restarts at 0, the loop increment brings it back to 1
+			m_column = 0;
+			m_line++;
+			break;
+		default:
+			valid = false;
+			break;
+		}
+
+		if (!valid) {
+			// Error!
+			printf("Invalid JSON!\n");
+			return;
+		}
+	}
+}
+
+// -----------------------------------------
+
+// Returns the character under the cursor without moving the cursor
+char Lexer::peek()
+{
+	const char current = m_input[m_index];
+	return current;
+}
+
+// Returns the character directly after the cursor without moving the cursor
+char Lexer::peekNext()
+{
+	const size_t next = m_index + 1;
+	return m_input[next];
+}
+
+// Returns the character under the cursor and moves the cursor (index and
+// column) one position forward
+char Lexer::consume()
+{
+	const char taken = m_input[m_index];
+	++m_index;
+	++m_column;
+	return taken;
+}
+
+// Moves the cursor one position forward only if the character under it equals
+// the requested one, returns whether the cursor was moved
+bool Lexer::consumeSpecific(char character)
+{
+	const bool matches = (peek() == character);
+	if (matches) {
+		++m_index;
+		++m_column;
+	}
+
+	return matches;
+}
+
+// Lexes a string, the cursor has to be on the opening quote.
+//
+// On success a String token holding the raw text between the quotes is pushed
+// and the cursor is left on the closing quote, the caller steps past it.
+// Returns false, with the cursor restored to the opening quote, if the input
+// ends before a closing quote was found.
+bool Lexer::getString()
+{
+	size_t index = m_index;
+	size_t column = m_column;
+	std::string symbol = "";
+
+	// Skip the opening quote
+	consume();
+
+	// True while the character under the cursor is escaped by a backslash
+	bool escape = false;
+	for (;;) {
+		// Unterminated string, never read past the end of the input
+		if (m_index >= m_input.length()) {
+			m_index = index;
+			m_column = column;
+			return false;
+		}
+
+		char character = peek();
+
+		// TODO: Escape sequences are stored verbatim, decode and validate them
+		// ", \, /, b(ackspace), f(orm feed), n (line feed), r (carriage return), t(ab), u(nicode) \u0021
+
+		// Only an unescaped quote terminates the string
+		if (!escape && character == '"') {
+			break;
+		}
+
+		// A backslash escapes exactly the next character, so in "\\" the
+		// second backslash does not start another escape
+		escape = (!escape && character == '\\');
+
+		m_index++;
+		m_column++;
+		symbol += character;
+	}
+
+	printf("Pushing ->       String:  \"%s\"\t%zu[%zu]\n", symbol.c_str(), m_line, column);
+	m_tokens.push_back({ Token::Type::String, m_line, column, symbol });
+
+	return true;
+}
+
+// Lexes a number, the cursor has to be on its first character ('-' or a digit).
+//
+// Accepted shape, following RFC 8259: [-] digits [. digits] [(e|E) [+|-] digits]
+// NOTE: leading zeros ("01") are accepted, although RFC 8259 forbids them.
+//
+// On success a Number token holding the lexed text is pushed and the cursor
+// is left on the last character of the number, the caller steps past it.
+// Returns false, with the cursor restored to the start of the number, if a
+// required digit is missing (lone "-", "1.", "1e").
+bool Lexer::getNumber()
+{
+	size_t index = m_index;
+	size_t column = m_column;
+	std::string symbol = "";
+
+	// Appends a run of digits to the symbol, returns false if there was none.
+	// Reading at the end of the input yields '\0', which ends the run.
+	auto takeDigits = [this, &symbol]() -> bool {
+		bool any = false;
+		while (peek() >= '0' && peek() <= '9') {
+			symbol += consume();
+			any = true;
+		}
+		return any;
+	};
+
+	// Puts the cursor back on the first character of the number
+	auto fail = [this, index, column]() -> bool {
+		m_index = index;
+		m_column = column;
+		return false;
+	};
+
+	// FIXME: Break on separator }], rather than valid number symbols to
+	//        get the entire thing, resulting in better error handling
+
+	// Optional sign, only valid in front of the integer part
+	if (peek() == '-') {
+		symbol += consume();
+	}
+
+	// Integer part
+	if (!takeDigits()) {
+		return fail();
+	}
+
+	// Optional fraction, the '.' has to be followed by a digit
+	if (peek() == '.') {
+		symbol += consume();
+		if (!takeDigits()) {
+			return fail();
+		}
+	}
+
+	// Optional exponent with optional sign
+	if (peek() == 'e' || peek() == 'E') {
+		symbol += consume();
+		if (peek() == '+' || peek() == '-') {
+			symbol += consume();
+		}
+		if (!takeDigits()) {
+			return fail();
+		}
+	}
+
+	// Step back onto the last character of the number
+	m_index--;
+	m_column--;
+
+	printf("Pushing ->       Number:  \"%s\"\t%zu[%zu]\n", symbol.c_str(), m_line, column);
+	m_tokens.push_back({ Token::Type::Number, m_line, column, symbol });
+
+	return true;
+}
+
+// Lexes one of the literal names false/null/true, the cursor has to be on the
+// first letter.
+//
+// On success a Literal token is pushed and the cursor is left on the last
+// letter of the name, the caller steps past it. Returns false, with the cursor
+// restored to the first letter, if the run of letters is not a literal name.
+bool Lexer::getLiteral()
+{
+	const size_t startIndex = m_index;
+	const size_t startColumn = m_column;
+
+	// Collect the run of lower-case letters, literal names contain nothing else
+	std::string word;
+	for (char current = peek(); current >= 'a' && current <= 'z'; current = peek()) {
+		word += current;
+		m_index++;
+		m_column++;
+	}
+
+	// Step back onto the last letter of the run
+	m_index--;
+	m_column--;
+
+	// Literal name validation
+	const bool known = (word == "false" || word == "null" || word == "true");
+	if (!known) {
+		m_index = startIndex;
+		m_column = startColumn;
+		return false;
+	}
+
+	printf("Pushing ->      Literal:  \"%s\"\t%zu[%zu]\n", word.c_str(), m_line, startColumn);
+	m_tokens.push_back({ Token::Type::Literal, m_line, startColumn, word });
+
+	return true;
+}
+
+} // namespace Json
--- a/src/util/json/lexer.h
+++ b/src/util/json/lexer.h
@ -0,0 +1,70 @@
+/*
+ * Copyright (C) 2022 Riyyi
+ *
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef JSON_LEXER_H
+#define JSON_LEXER_H
+
+// The JavaScript Object Notation (JSON) Data Interchange Format
+// https://www.rfc-editor.org/rfc/pdfrfc/rfc8259.txt.pdf
+
+#include <cstddef> // size_t
+#include <string>
+#include <vector>
+
+namespace Json {
+
+// A single lexeme produced by the Lexer, together with the position in the
+// input where it starts
+struct Token {
+	// Category of the lexeme
+	enum class Type {
+		None,         // Default value of a default-constructed token
+		BraceOpen,    // {
+		BraceClose,   // }
+		BracketOpen,  // [
+		BracketClose, // ]
+		Colon,        // :
+		Comma,        // ,
+		String,       // "foobar"
+		Number,       // 123.456
+		Literal,      // false/null/true (case sensitive)
+	};
+
+	Type type { Type::None };
+	// Line and column of the first character of the lexeme, as counted by
+	// the Lexer (which starts counting both at 1)
+	size_t line { 0 };
+	size_t column { 0 };
+	// Text of the lexeme: the characters between the quotes for a String,
+	// the lexed text for Number and Literal, empty for structural tokens
+	std::string symbol;
+};
+
+// Lexical analyzer
+// Splits JSON text into a flat list of tokens.
+//
+// Usage: construct with the text, call analyze(), then read tokens().
+class Lexer {
+public:
+	// Stores a copy of the input, nothing is lexed until analyze() is called
+	Lexer(const std::string& input);
+	virtual ~Lexer();
+
+	// Lexes the input starting at the current cursor position, appending to
+	// the token list; stops at the first lexeme that is not recognized
+	void analyze();
+
+	// Tokens produced so far, in input order
+	const std::vector<Token>& tokens() const { return m_tokens; }
+
+private:
+	// Look at the character under / directly after the cursor, the cursor
+	// does not move
+	char peek();
+	char peekNext();
+
+	// Move the cursor one character forward; consumeSpecific() only does so
+	// if the character under the cursor equals the given one
+	char consume();
+	bool consumeSpecific(char character);
+
+	// Lex one lexeme of the given kind starting at the cursor and push its
+	// token, return false if the lexeme is invalid
+	bool getString();
+	bool getNumber();
+	bool getLiteral();
+
+	std::string m_input;
+	// Cursor: m_index is the offset into m_input, m_column and m_line are the
+	// matching 1-based position used for the tokens
+	size_t m_index { 0 };
+	size_t m_column { 1 };
+	size_t m_line { 1 };
+
+	std::vector<Token> m_tokens;
+};
+
+} // namespace Json
+
+#endif // JSON_LEXER_H