2 changed files with 324 additions and 0 deletions
			
			
		@ -0,0 +1,254 @@
					 | 
				
			||||
/*
 | 
				
			||||
 * Copyright (C) 2022 Riyyi | 
				
			||||
 * | 
				
			||||
 * SPDX-License-Identifier: MIT | 
				
			||||
 */ | 
				
			||||
 | 
				
			||||
#include <cstddef>
#include <cstdio> // printf
#include <string>

#include "util/json/lexer.h"
				
			||||
 | 
				
			||||
namespace Json { | 
				
			||||
 | 
				
			||||
Lexer::Lexer(const std::string& input) | 
				
			||||
	: m_input(input) | 
				
			||||
{ | 
				
			||||
} | 
				
			||||
 | 
				
			||||
// Nothing to release by hand: all members clean up after themselves.
Lexer::~Lexer() = default;
				
			||||
 | 
				
			||||
// -----------------------------------------
 | 
				
			||||
 | 
				
			||||
void Lexer::analyze() | 
				
			||||
{ | 
				
			||||
	printf("---------\n"); | 
				
			||||
	printf("Input JSON:\n%s\n", m_input.c_str()); | 
				
			||||
	printf("---------\n"); | 
				
			||||
	printf("Lexing:\n"); | 
				
			||||
 | 
				
			||||
	while (m_index < m_input.length()) { | 
				
			||||
		switch (peek()) { | 
				
			||||
		case '{': | 
				
			||||
			printf("Pushing ->    BraceOpen:  \"{\"\t%zu[%zu]\n", m_line, m_column); | 
				
			||||
			m_tokens.push_back({ Token::Type::BraceOpen, m_line, m_column, "" }); | 
				
			||||
			break; | 
				
			||||
		case '}': | 
				
			||||
			printf("Pushing ->   BraceClose:  \"}\"\t%zu[%zu]\n", m_line, m_column); | 
				
			||||
			m_tokens.push_back({ Token::Type::BraceClose, m_line, m_column, "" }); | 
				
			||||
			break; | 
				
			||||
		case '[': | 
				
			||||
			printf("Pushing ->  BracketOpen:  \"[\"\t%zu[%zu]\n", m_line, m_column); | 
				
			||||
			m_tokens.push_back({ Token::Type::BracketOpen, m_line, m_column, "" }); | 
				
			||||
			break; | 
				
			||||
		case ']': | 
				
			||||
			printf("Pushing -> BracketClose:  \"]\"\t%zu[%zu]\n", m_line, m_column); | 
				
			||||
			m_tokens.push_back({ Token::Type::BracketClose, m_line, m_column, "" }); | 
				
			||||
			break; | 
				
			||||
		case ':': | 
				
			||||
			printf("Pushing ->        Colon:  \":\"\t%zu[%zu]\n", m_line, m_column); | 
				
			||||
			m_tokens.push_back({ Token::Type::Colon, m_line, m_column, "" }); | 
				
			||||
			break; | 
				
			||||
		case ',': | 
				
			||||
			printf("Pushing ->        Comma:  \",\"\t%zu[%zu]\n", m_line, m_column); | 
				
			||||
			m_tokens.push_back({ Token::Type::Comma, m_line, m_column, "" }); | 
				
			||||
			break; | 
				
			||||
		case '"': | 
				
			||||
			if (!getString()) { | 
				
			||||
				// Error!
 | 
				
			||||
				printf("Invalid JSON!\n"); | 
				
			||||
				return; | 
				
			||||
			} | 
				
			||||
			break; | 
				
			||||
		case '-': | 
				
			||||
		case '0': | 
				
			||||
		case '1': | 
				
			||||
		case '2': | 
				
			||||
		case '3': | 
				
			||||
		case '4': | 
				
			||||
		case '5': | 
				
			||||
		case '6': | 
				
			||||
		case '7': | 
				
			||||
		case '8': | 
				
			||||
		case '9': | 
				
			||||
			if (!getNumber()) { | 
				
			||||
				// Error!
 | 
				
			||||
				printf("Invalid JSON!\n"); | 
				
			||||
				return; | 
				
			||||
			} | 
				
			||||
			break; | 
				
			||||
		case 'f': | 
				
			||||
		case 'n': | 
				
			||||
		case 't': | 
				
			||||
			if (!getLiteral()) { | 
				
			||||
				// Error!
 | 
				
			||||
				printf("Invalid JSON!\n"); | 
				
			||||
				return; | 
				
			||||
			} | 
				
			||||
			break; | 
				
			||||
		case ' ': | 
				
			||||
		case '\t': | 
				
			||||
			break; | 
				
			||||
		case '\r': | 
				
			||||
			if (peekNext() == '\n') { // CRLF \r\n
 | 
				
			||||
				break; | 
				
			||||
			} | 
				
			||||
			m_column = 0; | 
				
			||||
			m_line++; | 
				
			||||
			break; | 
				
			||||
		case '\n': | 
				
			||||
			m_column = 0; | 
				
			||||
			m_line++; | 
				
			||||
			break; | 
				
			||||
		default: | 
				
			||||
			// Error!
 | 
				
			||||
			printf("Invalid JSON!\n"); | 
				
			||||
			return; | 
				
			||||
			break; | 
				
			||||
		} | 
				
			||||
 | 
				
			||||
		m_index++; | 
				
			||||
		m_column++; | 
				
			||||
	} | 
				
			||||
} | 
				
			||||
 | 
				
			||||
// -----------------------------------------
 | 
				
			||||
 | 
				
			||||
char Lexer::peek() | 
				
			||||
{ | 
				
			||||
	return m_input[m_index]; | 
				
			||||
} | 
				
			||||
 | 
				
			||||
char Lexer::peekNext() | 
				
			||||
{ | 
				
			||||
	return m_input[m_index + 1]; | 
				
			||||
} | 
				
			||||
 | 
				
			||||
char Lexer::consume() | 
				
			||||
{ | 
				
			||||
	char character = peek(); | 
				
			||||
	m_index++; | 
				
			||||
	m_column++; | 
				
			||||
	return character; | 
				
			||||
} | 
				
			||||
 | 
				
			||||
bool Lexer::consumeSpecific(char character) | 
				
			||||
{ | 
				
			||||
	if (peek() != character) { | 
				
			||||
		return false; | 
				
			||||
	} | 
				
			||||
 | 
				
			||||
	m_index++; | 
				
			||||
	m_column++; | 
				
			||||
	return true; | 
				
			||||
} | 
				
			||||
 | 
				
			||||
bool Lexer::getString() | 
				
			||||
{ | 
				
			||||
	size_t column = m_column; | 
				
			||||
	std::string symbol = ""; | 
				
			||||
 | 
				
			||||
	char character = consume(); | 
				
			||||
	for (;;) { | 
				
			||||
		character = peek(); | 
				
			||||
 | 
				
			||||
		// TODO: Escape logic goes here
 | 
				
			||||
		// ", \, /, b(ackspace), f(orm feed), l(ine feed), c(arriage return), t(ab), u(nicode) \u0021
 | 
				
			||||
 | 
				
			||||
		if (character == '"') { | 
				
			||||
			break; | 
				
			||||
		} | 
				
			||||
 | 
				
			||||
		m_index++; | 
				
			||||
		m_column++; | 
				
			||||
		symbol += character; | 
				
			||||
	} | 
				
			||||
 | 
				
			||||
	printf("Pushing ->       String:  \"%s\"\t%zu[%zu]\n", symbol.c_str(), m_line, column); | 
				
			||||
	m_tokens.push_back({ Token::Type::String, m_line, column, symbol }); | 
				
			||||
 | 
				
			||||
	return true; | 
				
			||||
} | 
				
			||||
 | 
				
			||||
bool Lexer::getNumber() | 
				
			||||
{ | 
				
			||||
	size_t index = m_index; | 
				
			||||
	size_t column = m_column; | 
				
			||||
	std::string symbol = ""; | 
				
			||||
 | 
				
			||||
	bool seenDot = false; | 
				
			||||
	char character; | 
				
			||||
	for (;;) { | 
				
			||||
		character = peek(); | 
				
			||||
 | 
				
			||||
		// FIXME: Break on separator }], rather than valid number symbols to
 | 
				
			||||
		//        get the entire thing, resulting in better error handling
 | 
				
			||||
		// FIXME: e/E and exponent are also valid characters (?)
 | 
				
			||||
		if (character != 45                          // -
 | 
				
			||||
		    && character != 46                       // .
 | 
				
			||||
		    && (character < 48 || character > 57)) { // 0-9
 | 
				
			||||
			break; | 
				
			||||
		} | 
				
			||||
 | 
				
			||||
		// Fail if '.' is used more than once
 | 
				
			||||
		if (seenDot == true && character == 46) { // .
 | 
				
			||||
			m_index = index; | 
				
			||||
			m_column = column; | 
				
			||||
			return false; | 
				
			||||
		} | 
				
			||||
		if (character == 46) { // .
 | 
				
			||||
			seenDot = true; | 
				
			||||
		} | 
				
			||||
 | 
				
			||||
		m_index++; | 
				
			||||
		m_column++; | 
				
			||||
		symbol += character; | 
				
			||||
	} | 
				
			||||
	m_index--; | 
				
			||||
	m_column--; | 
				
			||||
 | 
				
			||||
	printf("Pushing ->       Number:  \"%s\"\t%zu[%zu]\n", symbol.c_str(), m_line, column); | 
				
			||||
	m_tokens.push_back({ Token::Type::Number, m_line, column, symbol }); | 
				
			||||
 | 
				
			||||
	return true; | 
				
			||||
} | 
				
			||||
 | 
				
			||||
bool Lexer::getLiteral() | 
				
			||||
{ | 
				
			||||
	size_t index = m_index; | 
				
			||||
	size_t column = m_column; | 
				
			||||
 | 
				
			||||
	std::string symbol = ""; | 
				
			||||
 | 
				
			||||
	char character; | 
				
			||||
	for (;;) { | 
				
			||||
		character = peek(); | 
				
			||||
 | 
				
			||||
		// Literals can only contain lower-case letters
 | 
				
			||||
		if (character < 97 || character > 122) { // a-z
 | 
				
			||||
			break; | 
				
			||||
		} | 
				
			||||
 | 
				
			||||
		m_index++; | 
				
			||||
		m_column++; | 
				
			||||
		symbol += character; | 
				
			||||
	} | 
				
			||||
	m_index--; | 
				
			||||
	m_column--; | 
				
			||||
 | 
				
			||||
	// Literal name validation
 | 
				
			||||
	if (symbol != "false" && symbol != "null" && symbol != "true") { | 
				
			||||
		m_index = index; | 
				
			||||
		m_column = column; | 
				
			||||
		return false; | 
				
			||||
	} | 
				
			||||
 | 
				
			||||
	printf("Pushing ->      Literal:  \"%s\"\t%zu[%zu]\n", symbol.c_str(), m_line, column); | 
				
			||||
	m_tokens.push_back({ Token::Type::Literal, m_line, column, symbol }); | 
				
			||||
 | 
				
			||||
	return true; | 
				
			||||
} | 
				
			||||
 | 
				
			||||
} // namespace Json
 | 
				
			||||
@ -0,0 +1,70 @@
					 | 
				
			||||
/*
 | 
				
			||||
 * Copyright (C) 2022 Riyyi | 
				
			||||
 * | 
				
			||||
 * SPDX-License-Identifier: MIT | 
				
			||||
 */ | 
				
			||||
 | 
				
			||||
#ifndef JSON_LEXER_H | 
				
			||||
#define JSON_LEXER_H | 
				
			||||
 | 
				
			||||
// The JavaScript Object Notation (JSON) Data Interchange Format
 | 
				
			||||
// https://www.rfc-editor.org/rfc/pdfrfc/rfc8259.txt.pdf
 | 
				
			||||
 | 
				
			||||
#include <cstddef> // size_t | 
				
			||||
#include <string> | 
				
			||||
#include <vector> | 
				
			||||
 | 
				
			||||
namespace Json { | 
				
			||||
 | 
				
			||||
// One lexical unit of the JSON input together with where it was found.
//
// Kept as a plain aggregate: member order (type, line, column, symbol) is
// relied upon by brace-initialization at the call sites.
struct Token {
	// Token category; the text each category stands for is shown on the right
	enum class Type {
		None,
		BraceOpen,    // {
		BraceClose,   // }
		BracketOpen,  // [
		BracketClose, // ]
		Colon,        // :
		Comma,        // ,
		String,       // "foobar"
		Number,       // 123.456
		Literal,      // false/null/true (case sensitive)
	};

	Type type = Type::None;
	size_t line = 0;    // Line the token starts on
	size_t column = 0;  // Column the token starts on
	std::string symbol; // Token text for String/Number/Literal, empty otherwise
};
				
			||||
 | 
				
			||||
// Lexical analyzer
 | 
				
			||||
class Lexer { | 
				
			||||
public: | 
				
			||||
	Lexer(const std::string& input); | 
				
			||||
	virtual ~Lexer(); | 
				
			||||
 | 
				
			||||
	void analyze(); | 
				
			||||
 | 
				
			||||
	const std::vector<Token>& tokens() const { return m_tokens; } | 
				
			||||
 | 
				
			||||
private: | 
				
			||||
	char peek(); | 
				
			||||
	char peekNext(); | 
				
			||||
 | 
				
			||||
	char consume(); | 
				
			||||
	bool consumeSpecific(char character); | 
				
			||||
 | 
				
			||||
	bool getString(); | 
				
			||||
	bool getNumber(); | 
				
			||||
	bool getLiteral(); | 
				
			||||
 | 
				
			||||
	std::string m_input; | 
				
			||||
	size_t m_index { 0 }; | 
				
			||||
	size_t m_column { 1 }; | 
				
			||||
	size_t m_line { 1 }; | 
				
			||||
 | 
				
			||||
	std::vector<Token> m_tokens; | 
				
			||||
}; | 
				
			||||
 | 
				
			||||
} // namespace Json
 | 
				
			||||
 | 
				
			||||
#endif // JSON_LEXER_H
 | 
				
			||||
					Loading…
					
					
				
		Reference in new issue