64 lines
2.5 KiB
C++
64 lines
2.5 KiB
C++
#include "tokenizer.hpp"
|
|
|
|
#include <iostream>
|
|
|
|
/**
 * Removes leading whitespace from a string, in place.
 *
 * @param s string to trim; it is modified directly.
 * @return reference to the same string `s`, so the call can be chained or assigned.
 */
static inline std::string &ltrim(std::string &s) {
    // std::isspace is only defined for values representable as unsigned char (or EOF).
    // Receiving the character as unsigned char keeps bytes >= 0x80 (e.g. UTF-8 text)
    // from being passed as negative ints on platforms where char is signed.
    const auto first_visible = std::find_if(s.begin(), s.end(), [](unsigned char c) {
        return !std::isspace(c);
    });
    s.erase(s.begin(), first_visible);
    return s;
}
|
|
|
|
/**
 * Builds the token-recognition table and stores the source text to tokenize.
 *
 * Every entry pairs a human-readable label with a regex anchored at the start of the
 * input. tokenize_one_token() walks the table in insertion order, takes the first rule
 * that matches and casts the rule's index to PBTokenType. Two consequences:
 *   - order is significant: the keyword rules must stay ahead of IDENTIFIER, otherwise
 *     a keyword would be reported as an identifier;
 *   - the position of each rule must line up with the corresponding PBTokenType value
 *     (declared outside this file — keep the two in sync when adding rules).
 *
 * @param code source text to tokenize. Taken by value; its buffer is adopted by the
 *             tokenizer rather than copied a second time.
 */
Tokenizer::Tokenizer(std::string code) {
    tokentypes.clear();
    tokentypes.reserve(14);  // one slot per rule registered below

    tokentypes.emplace_back("PREPROC", std::regex("^(#[a-zA-Z]+\\b)"));

    // Keywords are matched case-insensitively.
    tokentypes.emplace_back("FUNCTION", std::regex("^(\\bfunction\\b)", std::regex_constants::icase));
    tokentypes.emplace_back("SUB", std::regex("^(\\bsub\\b)", std::regex_constants::icase));
    tokentypes.emplace_back("END", std::regex("^(\\bend\\b)", std::regex_constants::icase));
    tokentypes.emplace_back("AS", std::regex("^(\\bas\\b)", std::regex_constants::icase));
    tokentypes.emplace_back("TYPE", std::regex("^(\\blong\\b)", std::regex_constants::icase));

    tokentypes.emplace_back("IDENTIFIER", std::regex("^(\\b[a-zA-Z]+\\b)"));
    tokentypes.emplace_back("INTEGER", std::regex("^(\\b[0-9]+\\b)"));

    // A string literal ends at its first closing quote. The previous greedy `.*` ran to
    // the LAST quote on the line, so `"a", "b"` was swallowed as a single token.
    // A doubled quote ("") inside the literal is still accepted as part of it, which
    // keeps such literals in one token as the greedy pattern did.
    // NOTE(review): "" is assumed to be this dialect's quote escape — confirm.
    // Like the old `.`, the body never crosses a line break.
    tokentypes.emplace_back("STRING", std::regex("^(\"(?:[^\"\r\n]|\"\")*\")"));

    tokentypes.emplace_back("OPAREN", std::regex("^(\\()"));
    tokentypes.emplace_back("CPAREN", std::regex("^(\\))"));
    tokentypes.emplace_back("COMMA", std::regex("^(,)"));
    tokentypes.emplace_back("QUOTE", std::regex("^(')"));
    tokentypes.emplace_back("EQUALS", std::regex("^(=)"));

    // The parameter is already a private copy, so take over its buffer instead of
    // copying the whole source text again.
    this->code.swap(code);
}
|
|
|
|
std::vector<PBToken> Tokenizer::tokenize() {
|
|
std::vector<PBToken> tokens;
|
|
code = ltrim(code);
|
|
while(code.size() > 0) {
|
|
tokens.push_back(tokenize_one_token());
|
|
code = ltrim(code);
|
|
}
|
|
return tokens;
|
|
}
|
|
|
|
/**
 * Consumes exactly one token from the front of `code`.
 *
 * The rules in `tokentypes` are tried in table order; the first one that matches at
 * the very start of `code` wins. Its index becomes the token type, the matched text
 * becomes the token value, and that text is removed from `code`.
 *
 * @return the recognised token.
 * @throws std::runtime_error if no rule matches; the message contains the offending
 *         text up to the next whitespace character.
 */
PBToken Tokenizer::tokenize_one_token() {
    for (int tt = 0; tt < TOKEN_TYPE_COUNT; tt++) {
#ifdef DEBUG
        std::cout << "attempting to match " << std::get<0>(tokentypes[tt]) << std::endl;
#endif
        // Bind by reference: copying a std::regex for every rule on every token is costly.
        const std::regex &re = std::get<1>(tokentypes[tt]);
        std::smatch sm;

        // match_continuous restricts the search to matches that begin at the first
        // character. Without it regex_search scans the whole remaining input for every
        // rule, and on implementations where `^` also matches after a line terminator
        // it can report a match on a later line while the characters are stripped
        // from the front of `code`.
        if (std::regex_search(code, sm, re, std::regex_constants::match_continuous)) {
            PBToken tk;
            tk.type = static_cast<PBTokenType>(tt);
            // Copy the text out before `code` is modified: `sm` refers into `code`.
            tk.value = sm[0];
#ifdef DEBUG
            std::cout << "match found" << std::endl;
            std::cout << "tk.type: " << std::get<0>(tokentypes[tk.type]) << std::endl << "tk.value: " << tk.value << std::endl;
#endif
            // Drop the consumed prefix in place rather than building a new string.
            code.erase(0, tk.value.length());
            return tk;
        }
    }

    // find_first_of may return npos; substr then yields the whole remaining input.
    throw std::runtime_error("Couldn't match token: " + code.substr(0, code.find_first_of(" \t\r\n")));
}
|
|
|
|
std::string Tokenizer::dump() {
|
|
return this->code;
|
|
}
|