From 1e7ea750f6d645cfe02620025a1b1c2c6919e66c Mon Sep 17 00:00:00 2001
From: hiro
Date: Mon, 22 Dec 2025 18:39:09 -0500
Subject: [PATCH] wip: implement tokenizer in c++, ignore binaries

---
 .gitignore    |  2 ++
 neopb.cpp     | 41 +++++++----------------------------------
 tokenizer.cpp | 48 ++++++++++++++++++++++++++++++++++++++++++++++--
 tokenizer.hpp |  7 ++++++-
 4 files changed, 61 insertions(+), 37 deletions(-)

diff --git a/.gitignore b/.gitignore
index 3e0f233..f82575c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,4 @@
 a.out
 
+npb
+hello
diff --git a/neopb.cpp b/neopb.cpp
index 38fb0d7..d205561 100644
--- a/neopb.cpp
+++ b/neopb.cpp
@@ -1,8 +1,6 @@
 #include <fstream>
 #include <iostream>
-#include <regex>
 #include <string>
-#include <unordered_map>
 #include <vector>
 
 #include "tokenizer.hpp"
@@ -16,17 +14,24 @@ std::string readFileContents(std::string fname) {
 }
 
 int main(int argc, char* argv[]) {
+#ifdef DEBUG
 	for (int n = 0; n < argc; n++) {
 		std::cout << "arg" << n << ": " << argv[n] << std::endl;
 	}
+#endif
 	std::string infile = argc > 1 ? argv[1] : "";
 	std::string code = "";
 	if(infile.length() > 0) {
 		code = readFileContents(infile);
 	}
 	Tokenizer tokenizer = Tokenizer(code);
+
+#ifdef DEBUG
 	std::cout << "code: " << tokenizer.dump() << std::endl;
+#endif
+
 	std::vector<PBToken> tokens = tokenizer.tokenize();
+
 	for(int i = 0; i < tokens.size(); i++) {
 		std::cout << tokens[i].value << std::endl;
 	}
@@ -34,35 +39,3 @@ int main(int argc, char* argv[]) {
 	return 0;
 }
 
-//std::vector<PBToken> tokenize(std::string code) {
-//	const PBToken tokenize_one = [](std::string fragment) {
-//		//const std::unordered_map<std::string, std::regex> tokentypes = std::unordered_map<std::string, std::regex>();
-//		const std::regex re_func("\bfunction\b", std::regex_constants::icase);
-//		const std::regex re_sub( "\bsub\b", std::regex_constants::icase);
-//		const std::regex re_end( "\bend\b", std::regex_constants::icase);
-//		const std::regex re_as("\bas\b", std::regex_constants::icase);
-//		const std::regex re_type("\blong\b", std::regex_constants::icase);
-//		const std::regex re_identifier("\b[a-zA-Z]+\b");
-//		const std::regex re_integer("\b[0-9]+\b");
-//		const std::regex re_string("\".*\"");
-//		const std::regex re_oparen("\(");
-//		const std::regex re_cparen("\)");
-//		const std::regex re_comma(",");
-//		const std::regex re_quote("'");
-//		const std::regex re_equals("=");
-//
-//		PBTokenType tt = SUB;
-//		std::string val = fragment.trim();
-//
-//
-//
-//		return { .type = tt, .value = val };
-//	};
-//	std::vector<PBToken> tokens();
-//	while(code.length() > 0) {
-//		int split = code.find(' ');
-//		std::string fragment = split > 0 ? code.substr(0, split) : code;
-//		tokens.push_back(fragment);
-//	}
-//	return tokens;
-//}
diff --git a/tokenizer.cpp b/tokenizer.cpp
index 8a3938d..bc1cb02 100644
--- a/tokenizer.cpp
+++ b/tokenizer.cpp
@@ -1,17 +1,61 @@
 #include "tokenizer.hpp"
 
+#include <algorithm>
+
+static inline std::string ltrim(std::string &s) {
+	s.erase(s.begin(), std::find_if(s.begin(), s.end(), [](int c) {return !std::isspace(c);}));
+	return s;
+}
+
 Tokenizer::Tokenizer(std::string code) {
+	tokentypes = std::vector<std::tuple<std::string, std::regex>>();
+	tokentypes.push_back( { "PREPROC", std::regex("^(#[a-zA-Z]+\\b)") } );
+	tokentypes.push_back( { "FUNCTION", std::regex("^(\\bfunction\\b)", std::regex_constants::icase) } );
+	tokentypes.push_back( { "SUB", std::regex("^(\\bsub\\b)", std::regex_constants::icase) } );
+	tokentypes.push_back( { "END", std::regex("^(\\bend\\b)", std::regex_constants::icase) } );
+	tokentypes.push_back( { "AS", std::regex("^(\\bas\\b)", std::regex_constants::icase) } );
+	tokentypes.push_back( { "TYPE", std::regex("^(\\blong\\b)", std::regex_constants::icase) } );
+	tokentypes.push_back( { "IDENTIFIER", std::regex("^(\\b[a-zA-Z]+\\b)") } );
+	tokentypes.push_back( { "INTEGER", std::regex("^(\\b[0-9]+\\b)") } );
+	tokentypes.push_back( { "STRING", std::regex("^(\".*\")") } );
+	tokentypes.push_back( { "OPAREN", std::regex("^(\\()") } );
+	tokentypes.push_back( { "CPAREN", std::regex("^(\\))") } );
+	tokentypes.push_back( { "COMMA", std::regex("^(,)") } );
+	tokentypes.push_back( { "QUOTE", std::regex("^(')") } );
+	tokentypes.push_back( { "EQUALS", std::regex("^(=)") } );
 	this->code = code;
 }
 
 std::vector<PBToken> Tokenizer::tokenize() {
 	std::vector<PBToken> tokens;
-	tokens.push_back(tokenize_one_token());
+	code = ltrim(code);
+	while(code.size() > 0) {
+		tokens.push_back(tokenize_one_token());
+		code = ltrim(code);
+	}
 	return tokens;
 }
 
 PBToken Tokenizer::tokenize_one_token() {
-	return { .type = FUNCTION, .value = "Function" };
+	PBToken tk;
+	for(int tt = 0; tt < TOKEN_TYPE_COUNT; tt++) {
+#ifdef DEBUG
+		std::cout << "attempting to match " << std::get<0>(tokentypes[tt]) << std::endl;
+#endif
+		std::regex re = std::get<1>(tokentypes[tt]);
+		std::smatch sm;
+		if(std::regex_search(code, sm, re)) {
+			tk.type = (PBTokenType) tt;
+			tk.value = sm[0];
+#ifdef DEBUG
+			std::cout << "match found" << std::endl;
+			std::cout << "tk.type: " << std::get<0>(tokentypes[tk.type]) << std::endl << "tk.value: " << tk.value << std::endl;
+#endif
+			code = code.substr(tk.value.length());
+			return tk;
+		}
+	}
+	throw std::runtime_error("Couldn't match token: " + code.substr(0, code.find_first_of(" \t\r\n")));
 }
 
 std::string Tokenizer::dump() {
diff --git a/tokenizer.hpp b/tokenizer.hpp
index a144155..d6e129b 100644
--- a/tokenizer.hpp
+++ b/tokenizer.hpp
@@ -4,8 +4,11 @@
 
 #include <string>
 #include <vector>
+#include <regex>
+#include <tuple>
 
 typedef enum {
+	PREPROC,
 	FUNCTION,
 	SUB,
 	END,
@@ -28,7 +31,9 @@ typedef struct {
 } PBToken;
 
 class Tokenizer {
-	std::unordered_map<std::string, std::regex> tokentypes;
+	//std::unordered_map<std::string, std::regex> tokentypes;
+	std::vector<std::tuple<std::string, std::regex>> tokentypes;
+	//std::vector<std::regex> tokentypes;
 	std::string code;
 public:
 	Tokenizer(std::string);
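
Not part of the patch, for reference: the matching strategy above (anchored
patterns tried in declaration order, first match wins, matched text consumed
from the front of the input) can be exercised standalone. The sketch below is
illustrative only. The pattern subset, the std::pair-based table, and the demo
input are invented for the example, and the loop is inlined into main() rather
than using the Tokenizer class, which stores std::tuple entries and casts the
loop index to PBTokenType.

// Standalone sketch of a first-match tokenizer loop in the style of
// Tokenizer::tokenize_one_token(). Everything here is illustrative.
#include <iostream>
#include <regex>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

int main() {
	// Ordered (name, anchored pattern) pairs. Keyword patterns come
	// before IDENTIFIER, so "Sub" matches as a keyword, not a name.
	const std::vector<std::pair<std::string, std::regex>> types = {
		{ "SUB",        std::regex("^(\\bsub\\b)",  std::regex_constants::icase) },
		{ "END",        std::regex("^(\\bend\\b)",  std::regex_constants::icase) },
		{ "AS",         std::regex("^(\\bas\\b)",   std::regex_constants::icase) },
		{ "TYPE",       std::regex("^(\\blong\\b)", std::regex_constants::icase) },
		{ "IDENTIFIER", std::regex("^(\\b[a-zA-Z]+\\b)") },
		{ "INTEGER",    std::regex("^(\\b[0-9]+\\b)") },
		{ "OPAREN",     std::regex("^(\\()") },
		{ "CPAREN",     std::regex("^(\\))") },
		{ "EQUALS",     std::regex("^(=)") },
	};

	std::string code = "Sub Foo(x As Long) x = 42 End Sub";
	while (!code.empty()) {
		// Same job as ltrim() in the patch: skip leading whitespace.
		size_t start = code.find_first_not_of(" \t\r\n");
		if (start == std::string::npos)
			break;
		code.erase(0, start);

		// Try each pattern in order; the first one that matches at the
		// front of the remaining input wins and is consumed.
		bool matched = false;
		for (const auto& tt : types) {
			std::smatch sm;
			if (std::regex_search(code, sm, tt.second)) {
				std::cout << tt.first << ": " << sm[0] << "\n";
				code.erase(0, sm[0].length());
				matched = true;
				break;
			}
		}
		if (!matched)
			throw std::runtime_error("Couldn't match token: " + code);
	}
	return 0;
}

Each token prints as NAME: value (SUB: Sub, then IDENTIFIER: Foo, OPAREN: (,
and so on). Moving IDENTIFIER above the keyword entries would make "Sub"
tokenize as an identifier instead, which is why the push_back order in the
Tokenizer constructor doubles as match priority.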