wip: work on cpp tokenizer implementation
parent d0714f8664
commit fb1059683f

neopb.cpp
@@ -1,3 +1,4 @@
#include <fstream>
#include <iostream>
#include <regex>
#include <string>
@@ -8,43 +9,60 @@

std::vector<PBToken> tokenize(std::string code);

// slurp the entire file into a string via istreambuf iterators
std::string readFileContents(std::string fname) {
    std::ifstream ifs(fname);
    std::string contents((std::istreambuf_iterator<char>(ifs)), (std::istreambuf_iterator<char>()));
    return contents;
}

int main(int argc, char* argv[]) {
    //for (int n = 0; n < argc; n++) {
    //    std::cout << "arg" << n << ": " << argv[n] << std::endl;
    //}
    for (int n = 0; n < argc; n++) {
        std::cout << "arg" << n << ": " << argv[n] << std::endl;
    }
    std::string infile = argc > 1 ? argv[1] : "";
    std::string code = "";
    if (infile.length() > 0) {
        code = readFileContents(infile);
    }
    Tokenizer tokenizer(code); // direct init; `Tokenizer tokenizer = Tokenizer(code)` is redundant
    std::cout << "code: " << tokenizer.dump() << std::endl;
    std::vector<PBToken> tokens = tokenizer.tokenize();
    for (const PBToken& token : tokens) {
        std::cout << token.value << std::endl;
    }

    return 0;
}

std::vector<PBToken> tokenize(std::string code) {
    // classify one whitespace-delimited fragment as a token
    const auto tokenize_one = [](std::string fragment) -> PBToken {
        //const std::unordered_map<PBTokenType, std::regex> tokentypes = std::unordered_map();
        // raw string literals keep \b as a regex word boundary; in an ordinary
        // literal "\b" is a backspace character and the patterns can never match
        const std::regex re_func(R"(\bfunction\b)", std::regex_constants::icase);
        const std::regex re_sub(R"(\bsub\b)", std::regex_constants::icase);
        const std::regex re_end(R"(\bend\b)", std::regex_constants::icase);
        const std::regex re_as(R"(\bas\b)", std::regex_constants::icase);
        const std::regex re_type(R"(\blong\b)", std::regex_constants::icase);
        const std::regex re_identifier(R"(\b[a-zA-Z]+\b)");
        const std::regex re_integer(R"(\b[0-9]+\b)");
        const std::regex re_string(R"(".*")");
        const std::regex re_oparen(R"(\()");
        const std::regex re_cparen(R"(\))");
        const std::regex re_comma(",");
        const std::regex re_quote("'");
        const std::regex re_equals("=");

        PBTokenType tt = SUB; // placeholder: matching against the patterns above is still to do
        // std::string has no trim(); strip surrounding whitespace by hand
        const auto first = fragment.find_first_not_of(" \t\r\n");
        const auto last = fragment.find_last_not_of(" \t\r\n");
        std::string val = first == std::string::npos ? "" : fragment.substr(first, last - first + 1);

        return { .type = tt, .value = val };
    };
    std::vector<PBToken> tokens; // `tokens()` would declare a function (most vexing parse)
    while (code.length() > 0) {
        const std::string::size_type split = code.find(' ');
        std::string fragment = split != std::string::npos ? code.substr(0, split) : code;
        tokens.push_back(tokenize_one(fragment)); // wrap the fragment in a PBToken
        // consume the fragment and its separator so the loop terminates
        code.erase(0, split != std::string::npos ? split + 1 : code.length());
    }
    return tokens;
}
//std::vector<PBToken> tokenize(std::string code) {
//    const PBToken tokenize_one = [](std::string fragment) {
//        //const std::unordered_map<PBTokenType, std::regex> tokentypes = std::unordered_map();
//        const std::regex re_func("\bfunction\b", std::regex_constants::icase);
//        const std::regex re_sub( "\bsub\b", std::regex_constants::icase);
//        const std::regex re_end( "\bend\b", std::regex_constants::icase);
//        const std::regex re_as("\bas\b", std::regex_constants::icase);
//        const std::regex re_type("\blong\b", std::regex_constants::icase);
//        const std::regex re_identifier("\b[a-zA-Z]+\b");
//        const std::regex re_integer("\b[0-9]+\b");
//        const std::regex re_string("\".*\"");
//        const std::regex re_oparen("\(");
//        const std::regex re_cparen("\)");
//        const std::regex re_comma(",");
//        const std::regex re_quote("'");
//        const std::regex re_equals("=");
//
//        PBTokenType tt = SUB;
//        std::string val = fragment.trim();
//
//
//
//        return { .type = tt, .value = val };
//    };
//    std::vector<PBToken> tokens();
//    while(code.length() > 0) {
//        int split = code.find(' ');
//        std::string fragment = split > 0 ? code.substr(0, split) : code;
//        tokens.push_back(fragment);
//    }
//    return tokens;
//}

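The active tokenize above still labels every fragment SUB; matching against the regex table is the missing piece. A minimal sketch of how a fragment could be classified, in the spirit of the Ruby TOKEN_TYPES list in pb2c.rb below — the classify helper and the END enumerator are assumptions, not part of this commit:

#include <regex>
#include <string>
#include <utility>
#include <vector>
#include "tokenizer.hpp" // assumed to define PBToken and PBTokenType

// Hypothetical helper: the first pattern that matches the whole fragment wins.
PBToken classify(const std::string& val) {
    static const std::vector<std::pair<PBTokenType, std::regex>> patterns = {
        { FUNCTION, std::regex(R"(\bfunction\b)", std::regex_constants::icase) },
        { SUB,      std::regex(R"(\bsub\b)",      std::regex_constants::icase) },
        { END,      std::regex(R"(\bend\b)",      std::regex_constants::icase) }, // END is a guess
    };
    for (const auto& [type, re] : patterns) {
        if (std::regex_match(val, re)) {
            return { .type = type, .value = val };
        }
    }
    return { .type = SUB, .value = val }; // fallback mirrors the current stub
}
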
8
pb2c.rb
8
pb2c.rb
@ -2,6 +2,7 @@
|
||||
|
||||
class Tokenizer
  TOKEN_TYPES = [
    [:preproc, /#[a-zA-Z]+\b/i],
    [:function, /\bfunction\b/i],
    [:sub, /\bsub\b/i],
    [:end, /\bend\b/i],
@@ -25,6 +26,7 @@ class Tokenizer
    until @code.empty?
      tokens << tokenize_one_token
      @code = @code.strip
      #puts tokens.join("\n")
    end
  rescue RuntimeError => e
    puts tokens.join("\n")
@@ -58,6 +60,9 @@ class Parser
    parse_function
  end

  def parse_preproc
  end

  def parse_function
    consume(:function)
    name = consume(:identifier).value
@@ -146,7 +151,8 @@ class Generator
  end
end

tokens = Tokenizer.new(File.read("hello.bas")).tokenize
#tokens = Tokenizer.new(File.read("hello.bas")).tokenize
tokens = Tokenizer.new(File.read(ARGV[0])).tokenize
#puts "Tokens:\n"
#puts tokens.join("\n")
tree = Parser.new(tokens).parse

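The Ruby tokenize_one_token is not shown in these hunks, but the until @code.empty? loop and the rescue RuntimeError handler suggest it tries each TOKEN_TYPES regex at the front of @code and raises when nothing matches. A C++ counterpart of that consume-from-the-front step could look like the sketch below; consume_front and its patterns parameter are assumptions:

#include <regex>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>
#include "tokenizer.hpp" // assumed to define PBToken and PBTokenType

// Hypothetical: match one token at the very start of code, erase it, return it.
PBToken consume_front(std::string& code,
                      const std::vector<std::pair<PBTokenType, std::regex>>& patterns) {
    std::smatch m;
    for (const auto& [type, re] : patterns) {
        // match_continuous pins the match to position 0, like Ruby's \A anchor
        if (std::regex_search(code, m, re, std::regex_constants::match_continuous)) {
            PBToken tok{ .type = type, .value = m.str() };
            code.erase(0, m.length());
            return tok;
        }
    }
    throw std::runtime_error("couldn't match token on: " + code);
}
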
tokenizer.cpp
@@ -0,0 +1,19 @@
#include "tokenizer.hpp"

Tokenizer::Tokenizer(std::string code) {
    this->code = code;
}

std::vector<PBToken> Tokenizer::tokenize() {
    std::vector<PBToken> tokens;
    tokens.push_back(tokenize_one_token());
    return tokens;
}

PBToken Tokenizer::tokenize_one_token() {
    // stub: a single hard-coded token for now
    return { .type = FUNCTION, .value = "Function" };
}

std::string Tokenizer::dump() {
    return this->code;
}
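Tokenizer::tokenize currently emits that single hard-coded token. Once tokenize_one_token consumes what it matches from this->code, the loop shape of the Ruby version (until @code.empty? followed by @code = @code.strip) carries over almost verbatim; a sketch under that assumption:

// Sketch only: assumes tokenize_one_token() removes the matched text from
// this->code; with the current stub this loop would never terminate.
std::vector<PBToken> Tokenizer::tokenize() {
    std::vector<PBToken> tokens;
    while (!code.empty()) {
        tokens.push_back(tokenize_one_token());
        // drop whitespace between tokens, like @code.strip in the Ruby version
        const auto first = code.find_first_not_of(" \t\r\n");
        code.erase(0, first == std::string::npos ? code.size() : first);
    }
    return tokens;
}
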
tokenizer.hpp
@@ -34,5 +34,6 @@ public:
    Tokenizer(std::string);
    std::vector<PBToken> tokenize();
    PBToken tokenize_one_token();
    std::string dump();
};
#endif

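Only the tail of tokenizer.hpp appears in this diff. For orientation, the rest of the header plausibly looks something like the sketch below: FUNCTION, SUB, .type, .value, and this->code are confirmed by usage elsewhere in the commit, while the remaining enumerators and the include guard name are guesses:

#ifndef TOKENIZER_HPP // guard name is a guess
#define TOKENIZER_HPP

#include <string>
#include <vector>

// FUNCTION and SUB appear in the commit; the other enumerators are guesses
// based on the regex names (re_end, re_as, re_type, re_identifier, ...).
enum PBTokenType { FUNCTION, SUB, END, AS, TYPE, IDENTIFIER, INTEGER, STRING };

struct PBToken {
    PBTokenType type;  // used as .type throughout the commit
    std::string value; // used as .value throughout the commit
};

class Tokenizer {
    std::string code; // remaining source text, set by the constructor
public:
    Tokenizer(std::string);
    std::vector<PBToken> tokenize();
    PBToken tokenize_one_token();
    std::string dump();
};

#endif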