Browse Source

Util: Add JSON lexical analyzer

master
Riyyi 2 years ago
parent
commit
1c676f9548
  1. 254
      src/util/json/lexer.cpp
  2. 70
      src/util/json/lexer.h

254
src/util/json/lexer.cpp

@ -0,0 +1,254 @@
/*
* Copyright (C) 2022 Riyyi
*
* SPDX-License-Identifier: MIT
*/
#include <cstddef>
#include <cstdio> // printf
#include <string>
#include "util/json/lexer.h"
namespace Json {
Lexer::Lexer(const std::string& input)
: m_input(input)
{
}
// No manual cleanup is needed; the members release their own storage.
Lexer::~Lexer() = default;
// -----------------------------------------
void Lexer::analyze()
{
printf("---------\n");
printf("Input JSON:\n%s\n", m_input.c_str());
printf("---------\n");
printf("Lexing:\n");
while (m_index < m_input.length()) {
switch (peek()) {
case '{':
printf("Pushing -> BraceOpen: \"{\"\t%zu[%zu]\n", m_line, m_column);
m_tokens.push_back({ Token::Type::BraceOpen, m_line, m_column, "" });
break;
case '}':
printf("Pushing -> BraceClose: \"}\"\t%zu[%zu]\n", m_line, m_column);
m_tokens.push_back({ Token::Type::BraceClose, m_line, m_column, "" });
break;
case '[':
printf("Pushing -> BracketOpen: \"[\"\t%zu[%zu]\n", m_line, m_column);
m_tokens.push_back({ Token::Type::BracketOpen, m_line, m_column, "" });
break;
case ']':
printf("Pushing -> BracketClose: \"]\"\t%zu[%zu]\n", m_line, m_column);
m_tokens.push_back({ Token::Type::BracketClose, m_line, m_column, "" });
break;
case ':':
printf("Pushing -> Colon: \":\"\t%zu[%zu]\n", m_line, m_column);
m_tokens.push_back({ Token::Type::Colon, m_line, m_column, "" });
break;
case ',':
printf("Pushing -> Comma: \",\"\t%zu[%zu]\n", m_line, m_column);
m_tokens.push_back({ Token::Type::Comma, m_line, m_column, "" });
break;
case '"':
if (!getString()) {
// Error!
printf("Invalid JSON!\n");
return;
}
break;
case '-':
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
if (!getNumber()) {
// Error!
printf("Invalid JSON!\n");
return;
}
break;
case 'f':
case 'n':
case 't':
if (!getLiteral()) {
// Error!
printf("Invalid JSON!\n");
return;
}
break;
case ' ':
case '\t':
break;
case '\r':
if (peekNext() == '\n') { // CRLF \r\n
break;
}
m_column = 0;
m_line++;
break;
case '\n':
m_column = 0;
m_line++;
break;
default:
// Error!
printf("Invalid JSON!\n");
return;
break;
}
m_index++;
m_column++;
}
}
// -----------------------------------------
// Returns the character at the current position without advancing.
//
// Yields '\0' when the position is at or beyond the end of the input, so a
// caller that has advanced too far never indexes outside m_input.
char Lexer::peek()
{
    if (m_index >= m_input.length()) {
        return '\0';
    }

    return m_input[m_index];
}
// Returns the character one past the current position without advancing.
//
// Yields '\0' when there is no such character, so the lookahead never indexes
// outside m_input.
char Lexer::peekNext()
{
    const size_t length = m_input.length();

    // Compare by remaining length rather than computing m_index + 1, which
    // keeps the check free of wrap-around
    if (m_index >= length || length - m_index < 2) {
        return '\0';
    }

    return m_input[m_index + 1];
}
// Returns the character at the current position and then advances both the
// input index and the column counter by one.
char Lexer::consume()
{
    const char taken = peek();

    ++m_index;
    ++m_column;

    return taken;
}
bool Lexer::consumeSpecific(char character)
{
if (peek() != character) {
return false;
}
m_index++;
m_column++;
return true;
}
bool Lexer::getString()
{
size_t column = m_column;
std::string symbol = "";
char character = consume();
for (;;) {
character = peek();
// TODO: Escape logic goes here
// ", \, /, b(ackspace), f(orm feed), l(ine feed), c(arriage return), t(ab), u(nicode) \u0021
if (character == '"') {
break;
}
m_index++;
m_column++;
symbol += character;
}
printf("Pushing -> String: \"%s\"\t%zu[%zu]\n", symbol.c_str(), m_line, column);
m_tokens.push_back({ Token::Type::String, m_line, column, symbol });
return true;
}
bool Lexer::getNumber()
{
size_t index = m_index;
size_t column = m_column;
std::string symbol = "";
bool seenDot = false;
char character;
for (;;) {
character = peek();
// FIXME: Break on separator }], rather than valid number symbols to
// get the entire thing, resulting in better error handling
// FIXME: e/E and exponent are also valid characters (?)
if (character != 45 // -
&& character != 46 // .
&& (character < 48 || character > 57)) { // 0-9
break;
}
// Fail if '.' is used more than once
if (seenDot == true && character == 46) { // .
m_index = index;
m_column = column;
return false;
}
if (character == 46) { // .
seenDot = true;
}
m_index++;
m_column++;
symbol += character;
}
m_index--;
m_column--;
printf("Pushing -> Number: \"%s\"\t%zu[%zu]\n", symbol.c_str(), m_line, column);
m_tokens.push_back({ Token::Type::Number, m_line, column, symbol });
return true;
}
bool Lexer::getLiteral()
{
size_t index = m_index;
size_t column = m_column;
std::string symbol = "";
char character;
for (;;) {
character = peek();
// Literals can only contain lower-case letters
if (character < 97 || character > 122) { // a-z
break;
}
m_index++;
m_column++;
symbol += character;
}
m_index--;
m_column--;
// Literal name validation
if (symbol != "false" && symbol != "null" && symbol != "true") {
m_index = index;
m_column = column;
return false;
}
printf("Pushing -> Literal: \"%s\"\t%zu[%zu]\n", symbol.c_str(), m_line, column);
m_tokens.push_back({ Token::Type::Literal, m_line, column, symbol });
return true;
}
} // namespace Json

70
src/util/json/lexer.h

@ -0,0 +1,70 @@
/*
* Copyright (C) 2022 Riyyi
*
* SPDX-License-Identifier: MIT
*/
#ifndef JSON_LEXER_H
#define JSON_LEXER_H
// The JavaScript Object Notation (JSON) Data Interchange Format
// https://www.rfc-editor.org/rfc/pdfrfc/rfc8259.txt.pdf
#include <cstddef> // size_t
#include <string>
#include <vector>
namespace Json {
// One lexical unit produced by the lexer, together with where it was found.
// Member order matters: tokens are created with aggregate initialization as
// { type, line, column, symbol }.
struct Token {
    // Category of a token; None is what a default-constructed Token holds
    enum class Type {
        None,
        BraceOpen,    // {
        BraceClose,   // }
        BracketOpen,  // [
        BracketClose, // ]
        Colon,        // :
        Comma,        // ,
        String,       // "foobar"
        Number,       // 123.456
        Literal,      // false/null/true (case sensitive)
    };

    Type type = Type::None;
    size_t line = 0;    // Line the token starts on
    size_t column = 0;  // Column the token starts at
    std::string symbol; // Source text of the token; empty for structural tokens
};
// Lexical analyzer
// Lexical analyzer: turns a JSON text into a flat sequence of Tokens.
//
// Typical use is to construct it with the text, call analyze() once, and then
// read the result through tokens().
class Lexer {
public:
    // Keeps its own copy of `input`
    Lexer(const std::string& input);
    virtual ~Lexer();

    // Tokenizes the input, appending to the token list
    void analyze();

    // Tokens produced so far, in input order
    const std::vector<Token>& tokens() const { return m_tokens; }

private:
    // Character at the current position / one past it, without advancing
    char peek();
    char peekNext();

    // Return the current character and advance by one
    char consume();
    // Advance by one only if the current character equals `character`
    bool consumeSpecific(char character);

    // Each lexes one token kind at the current position and reports whether
    // a token was pushed
    bool getString();
    bool getNumber();
    bool getLiteral();

    std::string m_input;

    size_t m_index = 0;  // Offset into m_input
    size_t m_column = 1; // Column of the current position, starting at 1
    size_t m_line = 1;   // Line of the current position, starting at 1

    std::vector<Token> m_tokens;
};
} // namespace Json
#endif // JSON_LEXER_H
Loading…
Cancel
Save