Compare commits

..

9 Commits

Author SHA1 Message Date
hiro
3f6ef19cca feat: add build script 2025-12-24 12:40:07 -05:00
hiro
f2e188cf1f wip: add parser stub, cleanup scratch comments in tokenizer 2025-12-22 21:41:23 -05:00
hiro
1e7ea750f6 wip: implement tokenizer in c++, ignore binaries 2025-12-22 18:39:09 -05:00
hiro
fb1059683f wip: work on cpp tokenizer implementation 2025-12-21 18:20:47 -05:00
hiro
d0714f8664 split out tokenizer class 2025-11-18 16:24:59 -06:00
68ea89ece6 base c++ work 2025-11-18 09:12:59 -06:00
23e7faf186 remove c stub 2025-11-17 16:22:20 -06:00
13ce1a72c1 chore: gitignore a.out 2025-03-26 00:05:18 -05:00
acbdf1de5f first pass transpiler implementation - pb2c.rb
- Modified version of Gary Bernhardt's example compiler from scratch
- Transpiles absolute barebones PowerBasic to C

Compile with `./pb2c.rb hello.bas | gcc -xc -`
2025-03-26 00:04:16 -05:00
8 changed files with 356 additions and 0 deletions

4
.gitignore vendored Normal file
View File

@ -0,0 +1,4 @@
a.out
npb
hello

13
build.sh Executable file
View File

@ -0,0 +1,13 @@
#!/usr/bin/env -S bash
# Build script for neopb.
# Usage: ./build.sh [-d]
#   -d   debug build: adds -DDEBUG and reports MODE as "debug".
DEFINES=""
MODE="release"
while getopts "d" flag; do
    case "$flag" in
        d)
            DEFINES="$DEFINES -DDEBUG"
            MODE="debug"
            ;;
    esac
done
echo building neopb as "$MODE"...
# $DEFINES stays unquoted on purpose so multiple flags word-split into
# separate g++ arguments.
g++ -o npb $DEFINES neopb.cpp tokenizer.cpp

3
hello.bas Normal file
View File

@ -0,0 +1,3 @@
Function PBMain() as Long
PRINT "Hello, world!"
End Function

41
neopb.cpp Normal file
View File

@ -0,0 +1,41 @@
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
#include "tokenizer.hpp"
// NOTE(review): removed the free-function prototype
// `std::vector<PBToken> tokenize(std::string)` that used to sit here —
// it is never defined or called anywhere (lexing goes through
// Tokenizer::tokenize()), so it was dead code and a latent link error.

// Slurp the entire contents of `fname` into a string.
// An unopenable/missing file yields an empty string: the istreambuf
// iterator pair simply reads zero characters from a failed stream.
std::string readFileContents(std::string fname) {
    std::ifstream ifs(fname);
    // Extra parentheses around the first argument avoid the most vexing parse.
    std::string contents((std::istreambuf_iterator<char>(ifs)),
                         (std::istreambuf_iterator<char>()));
    return contents;
}
int main(int argc, char* argv[]) {
#ifdef DEBUG
for (int n = 0; n < argc; n++) {
std::cout << "arg" << n << ": " << argv[n] << std::endl;
}
#endif
std::string infile = argc > 1 ? argv[1] : "";
std::string code = "";
if(infile.length() > 0) {
code = readFileContents(infile);
}
Tokenizer tokenizer = Tokenizer(code);
#ifdef DEBUG
std::cout << "code: " << tokenizer.dump() << std::endl;
#endif
std::vector<PBToken> tokens = tokenizer.tokenize();
for(int i = 0; i < tokens.size(); i++) {
std::cout << tokens[i].value << std::endl;
}
return 0;
}

22
parser.hpp Normal file
View File

@ -0,0 +1,22 @@
#pragma once
// Include guard kept alongside #pragma once for maximum portability.
#ifndef PARSER_HPP
#define PARSER_HPP
#include <vector>
// NOTE(review): requires tokenizer.hpp (PBToken) to be included first,
// as the original did implicitly.

// Parser stub (wip): will turn the token stream produced by
// Tokenizer::tokenize() into an AST.
class Parser {
    std::vector<PBToken> tokens;  // remaining tokens to consume
public:
    // Fixed: the original declared `Parser(std::vector);` and
    // `std::vector parse();` — std::vector without a template argument
    // is ill-formed and would not compile once this header is included.
    explicit Parser(std::vector<PBToken>);
    // Placeholder return type until AST node types exist.
    std::vector<PBToken> parse();
};
#endif

168
pb2c.rb Executable file
View File

@ -0,0 +1,168 @@
#!/usr/bin/ruby
# Regex-driven lexer: turns PowerBasic source text into a flat list of
# Token structs. Patterns are tried strictly in TOKEN_TYPES order.
class Tokenizer
  TOKEN_TYPES = [
    [:preproc, /#[a-zA-Z]+\b/i],
    [:function, /\bfunction\b/i],
    [:sub, /\bsub\b/i],
    [:end, /\bend\b/i],
    [:as, /\bas\b/i],
    [:typename, /\blong\b/i],
    [:identifier, /\b[a-zA-Z]+\b/],
    [:integer, /\b[0-9]+\b/],
    [:string, /".*"/],
    [:oparen, /\(/],
    [:cparen, /\)/],
    [:comma, /,/],
    [:quote, /'/],
  ]

  def initialize(code)
    @code = code
  end

  # Lex the whole input. If a token can't be matched, dump what we had
  # so far and re-raise so the failure point is visible.
  def tokenize
    collected = []
    begin
      while !@code.empty?
        collected << tokenize_one_token
        @code = @code.strip
      end
    rescue RuntimeError
      puts collected.join("\n")
      raise
    end
    collected
  end

  # Match exactly one token at the front of @code, consume its text,
  # and return it; raises when nothing matches.
  def tokenize_one_token
    TOKEN_TYPES.each do |type, pattern|
      anchored = /\A(#{pattern})/
      match = anchored.match(@code)
      next unless match
      text = match[1]
      @code = @code[text.length..-1]
      return Token.new(type, text)
    end
    raise RuntimeError.new(
      "Couldn't match token on #{@code.inspect}")
  end
end
Token = Struct.new(:type, :value)
# Recursive-descent parser (wip): consumes the Token list produced by
# Tokenizer and builds AST nodes (FunctionNode / CallNode / StringNode).
class Parser
# tokens: array of Token structs; consumed destructively via shift.
def initialize(tokens)
@tokens = tokens
end
# Entry point; a program is currently exactly one function definition.
def parse
parse_function
end
# Stub: preprocessor directives are tokenized but not parsed yet.
def parse_preproc
end
# FUNCTION name(args) AS type <expr> END FUNCTION  ->  FunctionNode
def parse_function
consume(:function)
name = consume(:identifier).value
arg_names = parse_arg_names
consume(:as)
rtype = consume(:typename).value
body = parse_expr
consume(:end)
consume(:function)
FunctionNode.new(name, rtype, arg_names, body)
end
# Parenthesized, comma-separated identifier list; may be empty.
def parse_arg_names
arg_names = []
consume(:oparen)
if peek(:identifier)
arg_names << consume(:identifier).value
while peek(:comma)
consume(:comma)
arg_names << consume(:identifier).value
end
end
consume(:cparen)
arg_names
end
# Dispatch on the next token(s) to pick an expression form.
# NOTE(review): parse_integer, parse_string, parse_call and parse_var_ref
# are not defined anywhere in this file — only the parse_stmt branch
# (identifier followed by a string, e.g. PRINT "...") currently works.
def parse_expr
if peek(:integer)
parse_integer
elsif peek(:string)
parse_string
elsif peek(:identifier) && peek(:oparen, 1)
parse_call
elsif peek(:identifier) && peek(:string, 1)
parse_stmt
else
parse_var_ref
end
end
# Statement-style call like `PRINT "Hello"` -> CallNode(name, raw string).
def parse_stmt
name = consume(:identifier).value
arg_exprs = consume(:string).value
CallNode.new(name, arg_exprs)
end
# True when the token `offset` positions ahead has the expected type.
# NOTE(review): fetch raises IndexError past end of input — confirm intended.
def peek(expected_type, offset=0)
@tokens.fetch(offset).type == expected_type
end
# Pop the next token, raising unless it has the expected type.
def consume(expected_type)
token = @tokens.shift
if token.type == expected_type
token
else
raise RuntimeError.new(
"Expected token type #{expected_type.inspect} but got #{token.type.inspect}")
end
end
end
# AST nodes produced by Parser and consumed by Generator.
FunctionNode = Struct.new(:name, :type, :arg_names, :body) # function def; body is one expr node
StringNode = Struct.new(:value) # string literal; value keeps its quotes
CallNode = Struct.new(:name, :arg_exprs) # call; arg_exprs is the raw string argument
# Emits C source text for an AST node (recursing through function bodies).
class Generator
  def generate(node)
    case node
    when FunctionNode
      # The body becomes the function's single return expression.
      args = node.arg_names.join(',')
      body = generate(node.body)
      format("%s %s(%s) { return %s ; }", node.type.downcase, node.name, args, body)
    when CallNode
      format("%s(%s)", node.name, node.arg_exprs)
    when StringNode
      node.value
    else
      raise RuntimeError.new("Unexpected node type: #{node.class}")
    end
  end
end
# Driver: PowerBasic source file (ARGV[0]) -> tokens -> AST -> C on stdout.
#tokens = Tokenizer.new(File.read("hello.bas")).tokenize
tokens = Tokenizer.new(File.read(ARGV[0])).tokenize
#puts "Tokens:\n"
#puts tokens.join("\n")
tree = Parser.new(tokens).parse
#puts "\nAST:\n"
#puts tree
# Minimal C runtime preamble: PRINT maps directly onto printf.
RUNTIME = "#include <stdio.h>\n#define PRINT(a) printf(a)\n"
# C entry point that delegates to the transpiled PBMain.
CMAIN = "int main(void) { PBMain(); return 0; }"
generated = Generator.new.generate(tree)
#puts "\nGenerated:\n"
#puts generated
#puts "\nGenerated with preamble/postamble:\n"
puts [RUNTIME, generated, CMAIN].join("\n")

63
tokenizer.cpp Normal file
View File

@ -0,0 +1,63 @@
#include "tokenizer.hpp"
#include <algorithm>
#include <cctype>
#include <iostream>
// Strip leading whitespace from s in place and return a reference to it.
static inline std::string &ltrim(std::string &s) {
    // Cast through unsigned char: calling std::isspace with a plain char
    // that happens to be negative is undefined behavior.
    s.erase(s.begin(), std::find_if(s.begin(), s.end(),
                                    [](unsigned char c) { return !std::isspace(c); }));
    return s;
}
// Build the (name, regex) table — one entry per PBTokenType, in enum
// order, each pattern anchored at the start of input — then store the
// source text to be lexed.
Tokenizer::Tokenizer(std::string code) {
    const auto icase = std::regex_constants::icase;
    tokentypes = {
        { "PREPROC",    std::regex("^(#[a-zA-Z]+\\b)") },
        { "FUNCTION",   std::regex("^(\\bfunction\\b)", icase) },
        { "SUB",        std::regex("^(\\bsub\\b)", icase) },
        { "END",        std::regex("^(\\bend\\b)", icase) },
        { "AS",         std::regex("^(\\bas\\b)", icase) },
        { "TYPE",       std::regex("^(\\blong\\b)", icase) },
        { "IDENTIFIER", std::regex("^(\\b[a-zA-Z]+\\b)") },
        { "INTEGER",    std::regex("^(\\b[0-9]+\\b)") },
        { "STRING",     std::regex("^(\".*\")") },
        { "OPAREN",     std::regex("^(\\()") },
        { "CPAREN",     std::regex("^(\\))") },
        { "COMMA",      std::regex("^(,)") },
        { "QUOTE",      std::regex("^(')") },
        { "EQUALS",     std::regex("^(=)") },
    };
    this->code = code;
}
std::vector<PBToken> Tokenizer::tokenize() {
std::vector<PBToken> tokens;
code = ltrim(code);
while(code.size() > 0) {
tokens.push_back(tokenize_one_token());
code = ltrim(code);
}
return tokens;
}
// Try each token-type regex (in PBTokenType order) against the front of
// `code`; on the first hit, consume the matched text and return the token.
// Throws std::runtime_error naming the offending word when nothing matches.
PBToken Tokenizer::tokenize_one_token() {
PBToken tk;
// tt doubles as the PBTokenType value: tokentypes was built in enum order.
for(int tt = 0; tt < TOKEN_TYPE_COUNT; tt++) {
#ifdef DEBUG
std::cout << "attempting to match " << std::get<0>(tokentypes[tt]) << std::endl;
#endif
std::regex re = std::get<1>(tokentypes[tt]);
std::smatch sm;
if(std::regex_search(code, sm, re)) {
tk.type = (PBTokenType) tt;
// Patterns are anchored with ^(...), so sm[0] (the full match) is the
// leading text to consume.
tk.value = sm[0];
#ifdef DEBUG
std::cout << "match found" << std::endl;
std::cout << "tk.type: " << std::get<0>(tokentypes[tk.type]) << std::endl << "tk.value: " << tk.value << std::endl;
#endif
// Drop the consumed text from the front of the buffer.
code = code.substr(tk.value.length());
return tk;
}
}
// Report only up to the next whitespace so the message stays short.
throw std::runtime_error("Couldn't match token: " + code.substr(0, code.find_first_of(" \t\r\n")));
}
std::string Tokenizer::dump() {
return this->code;
}

42
tokenizer.hpp Normal file
View File

@ -0,0 +1,42 @@
#pragma once
// Include guard kept alongside #pragma once for maximum portability.
#ifndef TOKENIZER_HPP
#define TOKENIZER_HPP
#include <regex>
#include <string>  // PBToken::value / Tokenizer::code — was only available transitively
#include <unordered_map>
#include <utility>
#include <vector>

// Token kinds, in the exact order the tokenizer tries their regexes
// (the match loop uses the numeric value as the table index).
// TOKEN_TYPE_COUNT is a sentinel loop bound, not a real token.
typedef enum {
    PREPROC,
    FUNCTION,
    SUB,
    END,
    AS,
    TYPE,
    IDENTIFIER,
    INTEGER,
    STRING,
    OPAREN,
    CPAREN,
    COMMA,
    QUOTE,
    EQUALS,
    TOKEN_TYPE_COUNT
} PBTokenType;

// One lexed token: its kind plus the exact matched source text.
typedef struct {
    PBTokenType type;
    std::string value;
} PBToken;

// Regex-driven lexer for the PowerBasic subset.
class Tokenizer {
    // (name, start-anchored regex) pairs in PBTokenType order.
    std::vector<std::pair<std::string, std::regex>> tokentypes;
    // Remaining unconsumed source text.
    std::string code;
public:
    Tokenizer(std::string);
    std::vector<PBToken> tokenize();
    PBToken tokenize_one_token();
    std::string dump();
};
#endif