Compare commits

...

5 Commits

Author SHA1 Message Date
hiro
d0714f8664 split out tokenizer class 2025-11-18 16:24:59 -06:00
68ea89ece6 base c++ work 2025-11-18 09:12:59 -06:00
23e7faf186 remove c stub 2025-11-17 16:22:20 -06:00
13ce1a72c1 chore: gitignore a.out 2025-03-26 00:05:18 -05:00
acbdf1de5f first pass transpiler implementation - pb2c.rb
- Modified version of Gary Bernhardt's example compiler from scratch
- Transpiles absolute barebones PowerBasic to C

Compile with `./pb2c.rb | gcc -xc -`
2025-03-26 00:04:16 -05:00
6 changed files with 255 additions and 0 deletions

2
.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
a.out

3
hello.bas Normal file
View File

@ -0,0 +1,3 @@
Function PBMain() as Long
PRINT "Hello, world!"
End Function

50
neopb.cpp Normal file
View File

@ -0,0 +1,50 @@
#include <iostream>
#include <regex>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "tokenizer.hpp"
// Forward declaration; the definition follows main() below.
std::vector<PBToken> tokenize(std::string code);
// Entry point. The argument-echo loop is commented out, so this is
// currently a no-op that exits successfully; tokenize() is never called.
int main(int argc, char* argv[]) {
//for (int n = 0; n < argc; n++) {
// std::cout << "arg" << n << ": " << argv[n] << std::endl;
//}
return 0;
}
std::vector<PBToken> tokenize(std::string code) {
const PBToken tokenize_one = [](std::string fragment) {
//const std::unordered_map<PBTokenType, std::regex> tokentypes = std::unordered_map();
const std::regex re_func("\bfunction\b", std::regex_constants::icase);
const std::regex re_sub( "\bsub\b", std::regex_constants::icase);
const std::regex re_end( "\bend\b", std::regex_constants::icase);
const std::regex re_as("\bas\b", std::regex_constants::icase);
const std::regex re_type("\blong\b", std::regex_constants::icase);
const std::regex re_identifier("\b[a-zA-Z]+\b");
const std::regex re_integer("\b[0-9]+\b");
const std::regex re_string("\".*\"");
const std::regex re_oparen("\(");
const std::regex re_cparen("\)");
const std::regex re_comma(",");
const std::regex re_quote("'");
const std::regex re_equals("=");
PBTokenType tt = SUB;
std::string val = fragment.trim();
return { .type = tt, .value = val };
};
std::vector<PBToken> tokens();
while(code.length() > 0) {
int split = code.find(' ');
std::string fragment = split > 0 ? code.substr(0, split) : code;
tokens.push_back(fragment);
}
return tokens;
}

162
pb2c.rb Executable file
View File

@ -0,0 +1,162 @@
#!/usr/bin/ruby
# Lexer: turns PowerBasic source text into a flat list of Tokens.
class Tokenizer
  # Ordered (type, pattern) pairs. Keyword patterns precede the generic
  # identifier pattern so that e.g. "function" is not lexed as an
  # identifier.
  TOKEN_TYPES = [
    [:function, /\bfunction\b/i],
    [:sub, /\bsub\b/i],
    [:end, /\bend\b/i],
    [:as, /\bas\b/i],
    [:typename, /\blong\b/i],
    [:identifier, /\b[a-zA-Z]+\b/],
    [:integer, /\b[0-9]+\b/],
    [:string, /".*"/],
    [:oparen, /\(/],
    [:cparen, /\)/],
    [:comma, /,/],
    [:quote, /'/],
  ]

  def initialize(code)
    @code = code
  end

  # Consume @code one token at a time, returning an array of Tokens.
  # On a lexing error, dumps the tokens found so far and re-raises.
  def tokenize
    found = []
    begin
      until @code.empty?
        found << tokenize_one_token
        @code = @code.strip
      end
    rescue RuntimeError
      puts found.join("\n")
      raise
    end
    found
  end

  # Try each pattern anchored at the start of @code; the first hit is
  # consumed and wrapped in a Token. No match at all is a lexing error.
  def tokenize_one_token
    TOKEN_TYPES.each do |type, pattern|
      anchored = /\A(#{pattern})/
      match = anchored.match(@code)
      next unless match
      value = match[1]
      @code = @code[value.length..-1]
      return Token.new(type, value)
    end
    raise RuntimeError,
          "Couldn't match token on #{@code.inspect}"
  end
end
Token = Struct.new(:type, :value)
# Recursive-descent parser over the Token list produced by Tokenizer.
class Parser
  def initialize(tokens)
    @tokens = tokens
  end

  # A program is a single function definition.
  def parse
    parse_function
  end

  # FUNCTION <name>(<args>) AS <type> <expr> END FUNCTION
  def parse_function
    consume(:function)
    fn_name = consume(:identifier).value
    args = parse_arg_names
    consume(:as)
    return_type = consume(:typename).value
    fn_body = parse_expr
    consume(:end)
    consume(:function)
    FunctionNode.new(fn_name, return_type, args, fn_body)
  end

  # Parenthesized, comma-separated identifier list; may be empty.
  def parse_arg_names
    consume(:oparen)
    names = []
    if peek(:identifier)
      loop do
        names << consume(:identifier).value
        break unless peek(:comma)
        consume(:comma)
      end
    end
    consume(:cparen)
    names
  end

  # Dispatch on the upcoming token(s).
  # NOTE(review): parse_integer, parse_string, parse_call and
  # parse_var_ref are not defined anywhere in this file; those branches
  # raise NoMethodError if ever taken.
  def parse_expr
    return parse_integer if peek(:integer)
    return parse_string if peek(:string)
    return parse_call if peek(:identifier) && peek(:oparen, 1)
    return parse_stmt if peek(:identifier) && peek(:string, 1)
    parse_var_ref
  end

  # <identifier> <string> — e.g. PRINT "...". The raw string token
  # (quotes included) is stored as the call's argument.
  def parse_stmt
    CallNode.new(consume(:identifier).value, consume(:string).value)
  end

  # True when the token at `offset` has the expected type; raises
  # IndexError past the end of input (Array#fetch).
  def peek(expected_type, offset = 0)
    @tokens.fetch(offset).type == expected_type
  end

  # Pop the next token, insisting on its type.
  def consume(expected_type)
    token = @tokens.shift
    return token if token.type == expected_type
    raise RuntimeError,
          "Expected token type #{expected_type.inspect} but got #{token.type.inspect}"
  end
end
# AST node types produced by Parser and consumed by Generator.
# FunctionNode#type holds the PowerBasic return type name (e.g. "Long").
FunctionNode = Struct.new(:name, :type, :arg_names, :body)
# StringNode is handled by Generator but never built by the visible Parser.
StringNode = Struct.new(:value)
# CallNode#arg_exprs holds the raw string token text, quotes included.
CallNode = Struct.new(:name, :arg_exprs)
# Emits C source text for an AST node (see the Struct node types above
# Parser's output: FunctionNode, CallNode, StringNode).
class Generator
  # Recursively render `node` as a C fragment; raises on unknown nodes.
  def generate(node)
    case node
    when FunctionNode
      # A function body is a single expression, returned directly.
      body = generate(node.body)
      format("%s %s(%s) { return %s ; }",
             node.type.downcase, node.name, node.arg_names.join(','), body)
    when CallNode
      # arg_exprs is already raw text (quotes included), not a node list.
      format("%s(%s)", node.name, node.arg_exprs)
    when StringNode
      node.value
    else
      raise RuntimeError, "Unexpected node type: #{node.class}"
    end
  end
end
# Driver: lex, parse, and generate C for hello.bas, then print the full
# translation unit. Pipe the output into a C compiler, e.g.
# `./pb2c.rb | gcc -xc -`.
tokens = Tokenizer.new(File.read("hello.bas")).tokenize
#puts "Tokens:\n"
#puts tokens.join("\n")
tree = Parser.new(tokens).parse
#puts "\nAST:\n"
#puts tree
# C preamble: maps PowerBasic PRINT onto printf via a macro.
RUNTIME = "#include <stdio.h>\n#define PRINT(a) printf(a)\n"
# C entry point that calls the translated PBMain.
CMAIN = "int main(void) { PBMain(); return 0; }"
generated = Generator.new.generate(tree)
#puts "\nGenerated:\n"
#puts generated
#puts "\nGenerated with preamble/postamble:\n"
puts [RUNTIME, generated, CMAIN].join("\n")

0
tokenizer.cpp Normal file
View File

38
tokenizer.hpp Normal file
View File

@ -0,0 +1,38 @@
#pragma once
#ifndef TOKENIZER_HPP
#define TOKENIZER_HPP
#include <regex>
#include <string>
#include <unordered_map>
#include <vector>
// Token categories for the PowerBasic tokenizer. Mirrors the Ruby
// Tokenizer::TOKEN_TYPES table in pb2c.rb, with EQUALS added.
// TOKEN_TYPE_COUNT is a sentinel giving the number of real categories.
typedef enum {
FUNCTION,
SUB,
END,
AS,
TYPE,
IDENTIFIER,
INTEGER,
STRING,
OPAREN,
CPAREN,
COMMA,
QUOTE,
EQUALS,
TOKEN_TYPE_COUNT
} PBTokenType;
// One lexed token: its category plus the exact source text it matched.
typedef struct {
PBTokenType type;
std::string value;
} PBToken;
// C++ counterpart of the Ruby Tokenizer class in pb2c.rb: holds the
// source text and a type->pattern table, and lexes one token at a time.
// Declaration only — tokenizer.cpp is empty in this change, so no
// member definitions exist yet.
class Tokenizer {
std::unordered_map<PBTokenType, std::regex> tokentypes;
std::string code;
public:
Tokenizer(std::string);
std::vector<PBToken> tokenize();
PBToken tokenize_one_token();
};
#endif