From 1e7ea750f6d645cfe02620025a1b1c2c6919e66c Mon Sep 17 00:00:00 2001
From: hiro
Date: Mon, 22 Dec 2025 18:39:09 -0500
Subject: [PATCH] wip: implement tokenizer in c++, ignore binaries

---
 .gitignore    |  2 ++
 neopb.cpp     | 41 +++++++----------------------------------
 tokenizer.cpp | 48 ++++++++++++++++++++++++++++++++++++++++++++++--
 tokenizer.hpp |  7 ++++++-
 4 files changed, 61 insertions(+), 37 deletions(-)

diff --git a/.gitignore b/.gitignore
index 3e0f233..f82575c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,4 @@
 a.out
 
+npb
+hello
diff --git a/neopb.cpp b/neopb.cpp
index 38fb0d7..d205561 100644
--- a/neopb.cpp
+++ b/neopb.cpp
@@ -1,8 +1,6 @@
 #include <fstream>
 #include <iostream>
-#include <regex>
 #include <string>
-#include <unordered_map>
 #include <vector>
 
 #include "tokenizer.hpp"
@@ -16,17 +14,24 @@ std::string readFileContents(std::string fname) {
 }
 
 int main(int argc, char* argv[]) {
+#ifdef DEBUG
 	for (int n = 0; n < argc; n++) {
 		std::cout << "arg" << n << ": " << argv[n] << std::endl;
 	}
+#endif
 	std::string infile = argc > 1 ? argv[1] : "";
 	std::string code = "";
 	if(infile.length() > 0) {
 		code = readFileContents(infile);
 	}
 	Tokenizer tokenizer = Tokenizer(code);
+
+#ifdef DEBUG
 	std::cout << "code: " << tokenizer.dump() << std::endl;
+#endif
+
 	std::vector<PBToken> tokens = tokenizer.tokenize();
+
 	for(int i = 0; i < tokens.size(); i++) {
 		std::cout << tokens[i].value << std::endl;
 	}
@@ -34,35 +39,3 @@ int main(int argc, char* argv[]) {
 	return 0;
 }
 
-//std::vector<PBToken> tokenize(std::string code) {
-//	const PBToken tokenize_one = [](std::string fragment) {
-//		//const std::unordered_map<std::string, std::regex> tokentypes = std::unordered_map<std::string, std::regex>();
-//		const std::regex re_func("\bfunction\b", std::regex_constants::icase);
-//		const std::regex re_sub( "\bsub\b", std::regex_constants::icase);
-//		const std::regex re_end( "\bend\b", std::regex_constants::icase);
-//		const std::regex re_as("\bas\b", std::regex_constants::icase);
-//		const std::regex re_type("\blong\b", std::regex_constants::icase);
-//		const std::regex re_identifier("\b[a-zA-Z]+\b");
-//		const std::regex re_integer("\b[0-9]+\b");
-//		const std::regex re_string("\".*\"");
-//		const std::regex re_oparen("\(");
-//		const std::regex re_cparen("\)");
-//		const std::regex re_comma(",");
-//		const std::regex re_quote("'");
-//		const std::regex re_equals("=");
-//
-//		PBTokenType tt = SUB;
-//		std::string val = fragment.trim();
-//
-//
-//
-//		return { .type = tt, .value = val };
-//	};
-//	std::vector<PBToken> tokens();
-//	while(code.length() > 0) {
-//		int split = code.find(' ');
-//		std::string fragment = split > 0 ? code.substr(0, split) : code;
-//		tokens.push_back(fragment);
-//	}
-//	return tokens;
-//}
diff --git a/tokenizer.cpp b/tokenizer.cpp
index 8a3938d..bc1cb02 100644
--- a/tokenizer.cpp
+++ b/tokenizer.cpp
@@ -1,17 +1,61 @@
 #include "tokenizer.hpp"
 
+#include <algorithm>
+
+static inline std::string ltrim(std::string &s) {
+	s.erase(s.begin(), std::find_if(s.begin(), s.end(), [](int c) {return !std::isspace(c);}));
+	return s;
+}
+
 Tokenizer::Tokenizer(std::string code) {
+	tokentypes = std::vector<std::tuple<std::string, std::regex>>();
+	tokentypes.push_back( { "PREPROC", std::regex("^(#[a-zA-Z]+\\b)") } );
+	tokentypes.push_back( { "FUNCTION", std::regex("^(\\bfunction\\b)", std::regex_constants::icase) } );
+	tokentypes.push_back( { "SUB", std::regex("^(\\bsub\\b)", std::regex_constants::icase) } );
+	tokentypes.push_back( { "END", std::regex("^(\\bend\\b)", std::regex_constants::icase) } );
+	tokentypes.push_back( { "AS", std::regex("^(\\bas\\b)", std::regex_constants::icase) } );
+	tokentypes.push_back( { "TYPE", std::regex("^(\\blong\\b)", std::regex_constants::icase) } );
+	tokentypes.push_back( { "IDENTIFIER", std::regex("^(\\b[a-zA-Z]+\\b)") } );
+	tokentypes.push_back( { "INTEGER", std::regex("^(\\b[0-9]+\\b)") } );
+	tokentypes.push_back( { "STRING", std::regex("^(\".*\")") } );
+	tokentypes.push_back( { "OPAREN", std::regex("^(\\()") } );
+	tokentypes.push_back( { "CPAREN", std::regex("^(\\))") } );
+	tokentypes.push_back( { "COMMA", std::regex("^(,)") } );
+	tokentypes.push_back( { "QUOTE", std::regex("^(')") } );
+	tokentypes.push_back( { "EQUALS", std::regex("^(=)") } );
 	this->code = code;
 }
 
 std::vector<PBToken> Tokenizer::tokenize() {
 	std::vector<PBToken> tokens;
-	tokens.push_back(tokenize_one_token());
+	code = ltrim(code);
+	while(code.size() > 0) {
+		tokens.push_back(tokenize_one_token());
+		code = ltrim(code);
+	}
 	return tokens;
 }
 
 PBToken Tokenizer::tokenize_one_token() {
-	return { .type = FUNCTION, .value = "Function" };
+	PBToken tk;
+	for(int tt = 0; tt < TOKEN_TYPE_COUNT; tt++) {
+#ifdef DEBUG
+		std::cout << "attempting to match " << std::get<0>(tokentypes[tt]) << std::endl;
+#endif
+		std::regex re = std::get<1>(tokentypes[tt]);
+		std::smatch sm;
+		if(std::regex_search(code, sm, re)) {
+			tk.type = (PBTokenType) tt;
+			tk.value = sm[0];
+#ifdef DEBUG
+			std::cout << "match found" << std::endl;
+			std::cout << "tk.type: " << std::get<0>(tokentypes[tk.type]) << std::endl << "tk.value: " << tk.value << std::endl;
+#endif
+			code = code.substr(tk.value.length());
+			return tk;
+		}
+	}
+	throw std::runtime_error("Couldn't match token: " + code.substr(0, code.find_first_of(" \t\r\n")));
 }
 
 std::string Tokenizer::dump() {
diff --git a/tokenizer.hpp b/tokenizer.hpp
index a144155..d6e129b 100644
--- a/tokenizer.hpp
+++ b/tokenizer.hpp
@@ -4,8 +4,11 @@
 
 #include <string>
 #include <vector>
+#include <regex>
+#include <tuple>
 
 typedef enum {
+	PREPROC,
 	FUNCTION,
 	SUB,
 	END,
@@ -28,7 +31,9 @@ typedef struct {
 } PBToken;
 
 class Tokenizer {
-	std::unordered_map<std::string, std::regex> tokentypes;
+	//std::unordered_map<std::string, std::regex> tokentypes;
+	std::vector<std::tuple<std::string, std::regex>> tokentypes;
+	//std::vector<std::regex> tokentypes;
 	std::string code;
 public:
 	Tokenizer(std::string);
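
Not part of the patch, for reference: the matching strategy above (anchored
patterns tried in declaration order, first match wins, matched text consumed
from the front of the input) can be exercised standalone. The sketch below is
illustrative only. The pattern subset, the std::pair-based table, and the demo
input are invented for the example, and the loop is inlined into main() rather
than using the Tokenizer class, which stores std::tuple entries and casts the
loop index to PBTokenType.

// Standalone sketch of a first-match tokenizer loop in the style of
// Tokenizer::tokenize_one_token(). Everything here is illustrative.
#include <iostream>
#include <regex>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

int main() {
	// Ordered (name, anchored pattern) pairs. Keyword patterns come
	// before IDENTIFIER, so "Sub" matches as a keyword, not a name.
	const std::vector<std::pair<std::string, std::regex>> types = {
		{ "SUB",        std::regex("^(\\bsub\\b)",  std::regex_constants::icase) },
		{ "END",        std::regex("^(\\bend\\b)",  std::regex_constants::icase) },
		{ "AS",         std::regex("^(\\bas\\b)",   std::regex_constants::icase) },
		{ "TYPE",       std::regex("^(\\blong\\b)", std::regex_constants::icase) },
		{ "IDENTIFIER", std::regex("^(\\b[a-zA-Z]+\\b)") },
		{ "INTEGER",    std::regex("^(\\b[0-9]+\\b)") },
		{ "OPAREN",     std::regex("^(\\()") },
		{ "CPAREN",     std::regex("^(\\))") },
		{ "EQUALS",     std::regex("^(=)") },
	};

	std::string code = "Sub Foo(x As Long) x = 42 End Sub";
	while (!code.empty()) {
		// Same job as ltrim() in the patch: skip leading whitespace.
		size_t start = code.find_first_not_of(" \t\r\n");
		if (start == std::string::npos)
			break;
		code.erase(0, start);

		// Try each pattern in order; the first one that matches at the
		// front of the remaining input wins and is consumed.
		bool matched = false;
		for (const auto& tt : types) {
			std::smatch sm;
			if (std::regex_search(code, sm, tt.second)) {
				std::cout << tt.first << ": " << sm[0] << "\n";
				code.erase(0, sm[0].length());
				matched = true;
				break;
			}
		}
		if (!matched)
			throw std::runtime_error("Couldn't match token: " + code);
	}
	return 0;
}

Each token prints as NAME: value (SUB: Sub, then IDENTIFIER: Foo, OPAREN: (,
and so on). Moving IDENTIFIER above the keyword entries would make "Sub"
tokenize as an identifier instead, which is why the push_back order in the
Tokenizer constructor doubles as match priority.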