You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
252 lines
4.1 KiB
252 lines
4.1 KiB
/*
 * Copyright (C) 2023 Riyyi
 *
 * SPDX-License-Identifier: MIT
 */
|
|
|
#include <algorithm> |
|
#include <string> |
|
#include <unordered_set> |
|
|
|
#include "ruc/format/print.h" |
|
#include "ruc/genericlexer.h" |
|
|
|
#include "lexer.h" |
|
|
|
namespace blaze { |
|
|
|
// Construct a lexer for `input`. The text is passed on unchanged to the
// ruc::GenericLexer base; no tokens are produced until tokenize() is called.
Lexer::Lexer(std::string_view input)
	: ruc::GenericLexer(input)
{
}
|
|
|
// Nothing to release by hand; the members clean up after themselves.
Lexer::~Lexer() = default;
|
|
|
// ----------------------------------------- |
|
|
|
void Lexer::tokenize() |
|
{ |
|
if (m_tokens.size() != 0) { |
|
return; |
|
} |
|
|
|
while (m_index < m_input.length()) { |
|
switch (peek()) { |
|
case '~': // ~@ or ~ |
|
consumeSpliceUnquoteOrUnquote(); |
|
break; |
|
case '[': |
|
m_tokens.push_back({ Token::Type::ParenOpen, m_line, m_column, "[" }); |
|
break; |
|
case ']': |
|
m_tokens.push_back({ Token::Type::ParenClose, m_line, m_column, "]" }); |
|
break; |
|
case '{': |
|
m_tokens.push_back({ Token::Type::BraceOpen, m_line, m_column, "{" }); |
|
break; |
|
case '}': |
|
m_tokens.push_back({ Token::Type::BraceClose, m_line, m_column, "}" }); |
|
break; |
|
case '(': |
|
m_tokens.push_back({ Token::Type::ParenOpen, m_line, m_column, "(" }); |
|
break; |
|
case ')': |
|
m_tokens.push_back({ Token::Type::ParenClose, m_line, m_column, ")" }); |
|
break; |
|
case '\'': |
|
m_tokens.push_back({ Token::Type::Quote, m_line, m_column, "'" }); |
|
break; |
|
case '`': |
|
m_tokens.push_back({ Token::Type::Backtick, m_line, m_column, "`" }); |
|
break; |
|
case '^': |
|
m_tokens.push_back({ Token::Type::Caret, m_line, m_column, "^" }); |
|
break; |
|
case '@': |
|
m_tokens.push_back({ Token::Type::At, m_line, m_column, "@" }); |
|
break; |
|
case '"': |
|
if (!consumeString()) { |
|
return; |
|
} |
|
break; |
|
case ';': |
|
consumeComment(); |
|
break; |
|
case ' ': |
|
case '\t': |
|
case ',': |
|
break; |
|
case '\r': |
|
if (peek(1) == '\n') { // CRLF \r\n |
|
break; |
|
} |
|
m_column = -1; |
|
m_line++; |
|
break; |
|
case '\n': |
|
m_column = -1; |
|
m_line++; |
|
break; |
|
default: |
|
consumeValue(); |
|
break; |
|
} |
|
|
|
ignore(); |
|
m_column++; |
|
} |
|
} |
|
|
|
bool Lexer::consumeSpliceUnquoteOrUnquote() |
|
{ |
|
size_t column = m_column; |
|
|
|
ignore(); // ~ |
|
if (peek() == '@') { |
|
m_tokens.push_back({ Token::Type::Special, m_line, column, "~@" }); |
|
} |
|
else { |
|
m_tokens.push_back({ Token::Type::Tilde, m_line, column, "~" }); |
|
} |
|
|
|
return true; |
|
} |
|
|
|
bool Lexer::consumeString() |
|
{ |
|
size_t column = m_column; |
|
std::string text = "\""; |
|
|
|
static std::unordered_set<char> exit = { |
|
'"', |
|
'\r', |
|
'\n', |
|
'\0', |
|
}; |
|
|
|
bool escape = false; |
|
char character = consume(); |
|
for (;;) { |
|
character = peek(); |
|
|
|
if (!escape && character == '\\') { |
|
text += '\\'; |
|
ignore(); |
|
escape = true; |
|
continue; |
|
} |
|
|
|
if (!escape && exit.find(character) != exit.end()) { |
|
break; |
|
} |
|
|
|
text += character; |
|
ignore(); |
|
|
|
escape = false; |
|
} |
|
|
|
if (character == '"') { |
|
text += character; |
|
} |
|
|
|
print("lex text '{}'\n", text); |
|
|
|
m_tokens.push_back({ Token::Type::String, m_line, column, text }); |
|
|
|
return true; |
|
} |
|
|
|
bool Lexer::consumeComment() |
|
{ |
|
size_t column = m_column; |
|
std::string comment = ""; |
|
|
|
ignore(); // ; |
|
|
|
static std::unordered_set<char> exit = { |
|
'\r', |
|
'\n', |
|
'\0', |
|
}; |
|
|
|
char character = 0; |
|
for (;;) { |
|
character = peek(); |
|
|
|
if (exit.find(character) != exit.end()) { |
|
break; |
|
} |
|
|
|
comment += character; |
|
ignore(); |
|
} |
|
|
|
// Trim comment |
|
comment.erase(comment.begin(), |
|
std::find_if(comment.begin(), comment.end(), [](char c) { return !std::isspace(c); })); |
|
comment.erase(std::find_if(comment.rbegin(), comment.rend(), [](char c) { return !std::isspace(c); }).base(), |
|
comment.end()); |
|
|
|
m_tokens.push_back({ Token::Type::Comment, m_line, column, comment }); |
|
|
|
return true; |
|
} |
|
|
|
bool Lexer::consumeValue() |
|
{ |
|
size_t column = m_column; |
|
std::string value = ""; |
|
|
|
static std::unordered_set<char> exit = { |
|
'[', |
|
']', |
|
'{', |
|
'}', |
|
'(', |
|
')', |
|
'\'', |
|
'`', |
|
',', |
|
'"', |
|
';', |
|
' ', |
|
'\t', |
|
'\r', |
|
'\n', |
|
'\0', |
|
}; |
|
|
|
char character = 0; |
|
for (;;) { |
|
character = peek(); |
|
|
|
if (exit.find(character) != exit.end()) { |
|
break; |
|
} |
|
|
|
value += character; |
|
ignore(); |
|
} |
|
|
|
m_tokens.push_back({ Token::Type::Value, m_line, column, value }); |
|
|
|
retreat(); |
|
|
|
return true; |
|
} |
|
|
|
void Lexer::dump() const |
|
{ |
|
print("tokens: {}\n", m_tokens.size()); |
|
print("\""); |
|
for (auto& token : m_tokens) { |
|
print("{}", token.symbol); |
|
} |
|
print("\"\n"); |
|
} |
|
|
|
} // namespace blaze
|
|
|