Riyyi
3 years ago
2 changed files with 324 additions and 0 deletions
@ -0,0 +1,254 @@
|
||||
/*
|
||||
* Copyright (C) 2022 Riyyi |
||||
* |
||||
* SPDX-License-Identifier: MIT |
||||
*/ |
||||
|
||||
#include <cstddef> // size_t
#include <cstdio>  // printf
#include <string>

#include "util/json/lexer.h"
||||
|
||||
namespace Json { |
||||
|
||||
Lexer::Lexer(const std::string& input) |
||||
: m_input(input) |
||||
{ |
||||
} |
||||
|
||||
// Nothing to release by hand: the members clean up after themselves.
Lexer::~Lexer() = default;
||||
|
||||
// -----------------------------------------
|
||||
|
||||
void Lexer::analyze() |
||||
{ |
||||
printf("---------\n"); |
||||
printf("Input JSON:\n%s\n", m_input.c_str()); |
||||
printf("---------\n"); |
||||
printf("Lexing:\n"); |
||||
|
||||
while (m_index < m_input.length()) { |
||||
switch (peek()) { |
||||
case '{': |
||||
printf("Pushing -> BraceOpen: \"{\"\t%zu[%zu]\n", m_line, m_column); |
||||
m_tokens.push_back({ Token::Type::BraceOpen, m_line, m_column, "" }); |
||||
break; |
||||
case '}': |
||||
printf("Pushing -> BraceClose: \"}\"\t%zu[%zu]\n", m_line, m_column); |
||||
m_tokens.push_back({ Token::Type::BraceClose, m_line, m_column, "" }); |
||||
break; |
||||
case '[': |
||||
printf("Pushing -> BracketOpen: \"[\"\t%zu[%zu]\n", m_line, m_column); |
||||
m_tokens.push_back({ Token::Type::BracketOpen, m_line, m_column, "" }); |
||||
break; |
||||
case ']': |
||||
printf("Pushing -> BracketClose: \"]\"\t%zu[%zu]\n", m_line, m_column); |
||||
m_tokens.push_back({ Token::Type::BracketClose, m_line, m_column, "" }); |
||||
break; |
||||
case ':': |
||||
printf("Pushing -> Colon: \":\"\t%zu[%zu]\n", m_line, m_column); |
||||
m_tokens.push_back({ Token::Type::Colon, m_line, m_column, "" }); |
||||
break; |
||||
case ',': |
||||
printf("Pushing -> Comma: \",\"\t%zu[%zu]\n", m_line, m_column); |
||||
m_tokens.push_back({ Token::Type::Comma, m_line, m_column, "" }); |
||||
break; |
||||
case '"': |
||||
if (!getString()) { |
||||
// Error!
|
||||
printf("Invalid JSON!\n"); |
||||
return; |
||||
} |
||||
break; |
||||
case '-': |
||||
case '0': |
||||
case '1': |
||||
case '2': |
||||
case '3': |
||||
case '4': |
||||
case '5': |
||||
case '6': |
||||
case '7': |
||||
case '8': |
||||
case '9': |
||||
if (!getNumber()) { |
||||
// Error!
|
||||
printf("Invalid JSON!\n"); |
||||
return; |
||||
} |
||||
break; |
||||
case 'f': |
||||
case 'n': |
||||
case 't': |
||||
if (!getLiteral()) { |
||||
// Error!
|
||||
printf("Invalid JSON!\n"); |
||||
return; |
||||
} |
||||
break; |
||||
case ' ': |
||||
case '\t': |
||||
break; |
||||
case '\r': |
||||
if (peekNext() == '\n') { // CRLF \r\n
|
||||
break; |
||||
} |
||||
m_column = 0; |
||||
m_line++; |
||||
break; |
||||
case '\n': |
||||
m_column = 0; |
||||
m_line++; |
||||
break; |
||||
default: |
||||
// Error!
|
||||
printf("Invalid JSON!\n"); |
||||
return; |
||||
break; |
||||
} |
||||
|
||||
m_index++; |
||||
m_column++; |
||||
} |
||||
} |
||||
|
||||
// -----------------------------------------
|
||||
|
||||
char Lexer::peek() |
||||
{ |
||||
return m_input[m_index]; |
||||
} |
||||
|
||||
char Lexer::peekNext() |
||||
{ |
||||
return m_input[m_index + 1]; |
||||
} |
||||
|
||||
char Lexer::consume() |
||||
{ |
||||
char character = peek(); |
||||
m_index++; |
||||
m_column++; |
||||
return character; |
||||
} |
||||
|
||||
bool Lexer::consumeSpecific(char character) |
||||
{ |
||||
if (peek() != character) { |
||||
return false; |
||||
} |
||||
|
||||
m_index++; |
||||
m_column++; |
||||
return true; |
||||
} |
||||
|
||||
bool Lexer::getString() |
||||
{ |
||||
size_t column = m_column; |
||||
std::string symbol = ""; |
||||
|
||||
char character = consume(); |
||||
for (;;) { |
||||
character = peek(); |
||||
|
||||
// TODO: Escape logic goes here
|
||||
// ", \, /, b(ackspace), f(orm feed), l(ine feed), c(arriage return), t(ab), u(nicode) \u0021
|
||||
|
||||
if (character == '"') { |
||||
break; |
||||
} |
||||
|
||||
m_index++; |
||||
m_column++; |
||||
symbol += character; |
||||
} |
||||
|
||||
printf("Pushing -> String: \"%s\"\t%zu[%zu]\n", symbol.c_str(), m_line, column); |
||||
m_tokens.push_back({ Token::Type::String, m_line, column, symbol }); |
||||
|
||||
return true; |
||||
} |
||||
|
||||
bool Lexer::getNumber() |
||||
{ |
||||
size_t index = m_index; |
||||
size_t column = m_column; |
||||
std::string symbol = ""; |
||||
|
||||
bool seenDot = false; |
||||
char character; |
||||
for (;;) { |
||||
character = peek(); |
||||
|
||||
// FIXME: Break on separator }], rather than valid number symbols to
|
||||
// get the entire thing, resulting in better error handling
|
||||
// FIXME: e/E and exponent are also valid characters (?)
|
||||
if (character != 45 // -
|
||||
&& character != 46 // .
|
||||
&& (character < 48 || character > 57)) { // 0-9
|
||||
break; |
||||
} |
||||
|
||||
// Fail if '.' is used more than once
|
||||
if (seenDot == true && character == 46) { // .
|
||||
m_index = index; |
||||
m_column = column; |
||||
return false; |
||||
} |
||||
if (character == 46) { // .
|
||||
seenDot = true; |
||||
} |
||||
|
||||
m_index++; |
||||
m_column++; |
||||
symbol += character; |
||||
} |
||||
m_index--; |
||||
m_column--; |
||||
|
||||
printf("Pushing -> Number: \"%s\"\t%zu[%zu]\n", symbol.c_str(), m_line, column); |
||||
m_tokens.push_back({ Token::Type::Number, m_line, column, symbol }); |
||||
|
||||
return true; |
||||
} |
||||
|
||||
bool Lexer::getLiteral() |
||||
{ |
||||
size_t index = m_index; |
||||
size_t column = m_column; |
||||
|
||||
std::string symbol = ""; |
||||
|
||||
char character; |
||||
for (;;) { |
||||
character = peek(); |
||||
|
||||
// Literals can only contain lower-case letters
|
||||
if (character < 97 || character > 122) { // a-z
|
||||
break; |
||||
} |
||||
|
||||
m_index++; |
||||
m_column++; |
||||
symbol += character; |
||||
} |
||||
m_index--; |
||||
m_column--; |
||||
|
||||
// Literal name validation
|
||||
if (symbol != "false" && symbol != "null" && symbol != "true") { |
||||
m_index = index; |
||||
m_column = column; |
||||
return false; |
||||
} |
||||
|
||||
printf("Pushing -> Literal: \"%s\"\t%zu[%zu]\n", symbol.c_str(), m_line, column); |
||||
m_tokens.push_back({ Token::Type::Literal, m_line, column, symbol }); |
||||
|
||||
return true; |
||||
} |
||||
|
||||
} // namespace Json
|
@ -0,0 +1,70 @@
|
||||
/*
|
||||
* Copyright (C) 2022 Riyyi |
||||
* |
||||
* SPDX-License-Identifier: MIT |
||||
*/ |
||||
|
||||
#ifndef JSON_LEXER_H |
||||
#define JSON_LEXER_H |
||||
|
||||
// The JavaScript Object Notation (JSON) Data Interchange Format
|
||||
// https://www.rfc-editor.org/rfc/pdfrfc/rfc8259.txt.pdf
|
||||
|
||||
#include <cstddef> // size_t |
||||
#include <string> |
||||
#include <vector> |
||||
|
||||
namespace Json { |
||||
|
||||
// A single lexical unit of a JSON document together with the position at
// which it starts. Kept as a plain aggregate so it can be brace-initialized
// as { type, line, column, symbol }.
struct Token {
    // Kind of token
    enum class Type {
        None,
        BraceOpen,    // {
        BraceClose,   // }
        BracketOpen,  // [
        BracketClose, // ]
        Colon,        // :
        Comma,        // ,
        String,       // "foobar"
        Number,       // 123.456
        Literal,      // false/null/true (case sensitive)
    };

    Type type { Type::None };
    size_t line { 0 };   // line on which the token starts
    size_t column { 0 }; // column at which the token starts
    std::string symbol;  // token text; empty for structural tokens
};
||||
|
||||
// Lexical analyzer
|
||||
class Lexer { |
||||
public: |
||||
Lexer(const std::string& input); |
||||
virtual ~Lexer(); |
||||
|
||||
void analyze(); |
||||
|
||||
const std::vector<Token>& tokens() const { return m_tokens; } |
||||
|
||||
private: |
||||
char peek(); |
||||
char peekNext(); |
||||
|
||||
char consume(); |
||||
bool consumeSpecific(char character); |
||||
|
||||
bool getString(); |
||||
bool getNumber(); |
||||
bool getLiteral(); |
||||
|
||||
std::string m_input; |
||||
size_t m_index { 0 }; |
||||
size_t m_column { 1 }; |
||||
size_t m_line { 1 }; |
||||
|
||||
std::vector<Token> m_tokens; |
||||
}; |
||||
|
||||
} // namespace Json
|
||||
|
||||
#endif // JSON_LEXER_H
|
Loading…
Reference in new issue