wip: work on cpp tokenizer implementation

hiro 2025-12-21 18:20:47 -05:00
parent d0714f8664
commit fb1059683f
4 changed files with 80 additions and 36 deletions

View File

@@ -1,3 +1,4 @@
+#include <fstream>
 #include <iostream>
 #include <regex>
 #include <string>
@@ -8,43 +9,60 @@
 std::vector<PBToken> tokenize(std::string code);
 
+std::string readFileContents(std::string fname) {
+    std::ifstream ifs(fname);
+    std::string contents( (std::istreambuf_iterator<char>(ifs)), (std::istreambuf_iterator<char>()) );
+    return contents;
+}
+
 int main(int argc, char* argv[]) {
-    //for (int n = 0; n < argc; n++) {
-    //    std::cout << "arg" << n << ": " << argv[n] << std::endl;
-    //}
+    for (int n = 0; n < argc; n++) {
+        std::cout << "arg" << n << ": " << argv[n] << std::endl;
+    }
+
+    std::string infile = argc > 1 ? argv[1] : "";
+    std::string code = "";
+    if(infile.length() > 0) {
+        code = readFileContents(infile);
+    }
+
+    Tokenizer tokenizer = Tokenizer(code);
+    std::cout << "code: " << tokenizer.dump() << std::endl;
+
+    std::vector<PBToken> tokens = tokenizer.tokenize();
+    for(int i = 0; i < tokens.size(); i++) {
+        std::cout << tokens[i].value << std::endl;
+    }
+
     return 0;
 }
 
-std::vector<PBToken> tokenize(std::string code) {
-    const PBToken tokenize_one = [](std::string fragment) {
-        //const std::unordered_map<PBTokenType, std::regex> tokentypes = std::unordered_map();
-        const std::regex re_func("\bfunction\b", std::regex_constants::icase);
-        const std::regex re_sub( "\bsub\b", std::regex_constants::icase);
-        const std::regex re_end( "\bend\b", std::regex_constants::icase);
-        const std::regex re_as("\bas\b", std::regex_constants::icase);
-        const std::regex re_type("\blong\b", std::regex_constants::icase);
-        const std::regex re_identifier("\b[a-zA-Z]+\b");
-        const std::regex re_integer("\b[0-9]+\b");
-        const std::regex re_string("\".*\"");
-        const std::regex re_oparen("\(");
-        const std::regex re_cparen("\)");
-        const std::regex re_comma(",");
-        const std::regex re_quote("'");
-        const std::regex re_equals("=");
-
-        PBTokenType tt = SUB;
-        std::string val = fragment.trim();
-
-
-
-        return { .type = tt, .value = val };
-    };
-    std::vector<PBToken> tokens();
-    while(code.length() > 0) {
-        int split = code.find(' ');
-        std::string fragment = split > 0 ? code.substr(0, split) : code;
-        tokens.push_back(fragment);
-    }
-    return tokens;
-}
+//std::vector<PBToken> tokenize(std::string code) {
+//    const PBToken tokenize_one = [](std::string fragment) {
+//        //const std::unordered_map<PBTokenType, std::regex> tokentypes = std::unordered_map();
+//        const std::regex re_func("\bfunction\b", std::regex_constants::icase);
+//        const std::regex re_sub( "\bsub\b", std::regex_constants::icase);
+//        const std::regex re_end( "\bend\b", std::regex_constants::icase);
+//        const std::regex re_as("\bas\b", std::regex_constants::icase);
+//        const std::regex re_type("\blong\b", std::regex_constants::icase);
+//        const std::regex re_identifier("\b[a-zA-Z]+\b");
+//        const std::regex re_integer("\b[0-9]+\b");
+//        const std::regex re_string("\".*\"");
+//        const std::regex re_oparen("\(");
+//        const std::regex re_cparen("\)");
+//        const std::regex re_comma(",");
+//        const std::regex re_quote("'");
+//        const std::regex re_equals("=");
+//
+//        PBTokenType tt = SUB;
+//        std::string val = fragment.trim();
+//
+//
+//
+//        return { .type = tt, .value = val };
+//    };
+//    std::vector<PBToken> tokens();
+//    while(code.length() > 0) {
+//        int split = code.find(' ');
+//        std::string fragment = split > 0 ? code.substr(0, split) : code;
+//        tokens.push_back(fragment);
+//    }
+//    return tokens;
+//}
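
Three latent bugs in the commented-out tokenize are worth noting before it is revived: in an ordinary string literal such as "\bfunction\b" the compiler turns \b into a backspace character (0x08), so the keyword regexes can never match a word boundary; std::string has no trim() member; and std::vector<PBToken> tokens(); declares a function (the "most vexing parse") rather than default-constructing a vector. A minimal sketch of the corrected idioms, using a stand-in PBToken since the real definition lives in tokenizer.hpp:

#include <regex>
#include <string>
#include <vector>

struct PBToken { int type; std::string value; };  // stand-in; the real struct is in tokenizer.hpp

// Raw string literals pass \b through to std::regex as a word boundary;
// in the plain literal "\bfunction\b" it would become backspace instead.
const std::regex re_func(R"(\bfunction\b)", std::regex_constants::icase);
const std::regex re_identifier(R"(\b[a-zA-Z]+\b)");

int main() {
    std::vector<PBToken> tokens;  // no parentheses: "tokens()" would declare a function
    std::string fragment = "  Function  ";

    // std::string has no trim(); erase leading and trailing whitespace instead
    fragment.erase(0, fragment.find_first_not_of(" \t\r\n"));
    fragment.erase(fragment.find_last_not_of(" \t\r\n") + 1);

    if (std::regex_match(fragment, re_func))
        tokens.push_back({ 0, fragment });
    return tokens.empty() ? 1 : 0;
}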

View File

@@ -2,6 +2,7 @@
 class Tokenizer
   TOKEN_TYPES = [
+    [:preproc, /#[a-zA-Z]+\b/i],
     [:function, /\bfunction\b/i],
     [:sub, /\bsub\b/i],
     [:end, /\bend\b/i],
@@ -25,6 +26,7 @@ class Tokenizer
     until @code.empty?
       tokens << tokenize_one_token
       @code = @code.strip
+      #puts tokens.join("\n")
     end
   rescue RuntimeError => e
     puts tokens.join("\n")
@@ -58,6 +60,9 @@ class Parser
     parse_function
   end
 
+  def parse_preproc
+  end
+
   def parse_function
     consume(:function)
     name = consume(:identifier).value
@@ -146,7 +151,8 @@ class Generator
   end
 end
 
-tokens = Tokenizer.new(File.read("hello.bas")).tokenize
+#tokens = Tokenizer.new(File.read("hello.bas")).tokenize
+tokens = Tokenizer.new(File.read(ARGV[0])).tokenize
 #puts "Tokens:\n"
 #puts tokens.join("\n")
 tree = Parser.new(tokens).parse
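
For the C++ port, the Ruby TOKEN_TYPES table translates naturally to a vector of (type, regex) pairs tried in order, with keywords listed before the identifier pattern just as in the Ruby array. A sketch of that translation; the PBTokenType values and PBToken shape are assumed (PREPROC mirrors the new :preproc entry), the Ruby tokenize_one_token body is not shown in this hunk so its consume-and-return shape is also an assumption, and std::regex_constants::match_continuous plays the role of Ruby's \A anchor:

#include <regex>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

enum PBTokenType { PREPROC, FUNCTION, SUB, END, IDENTIFIER };  // assumed subset
struct PBToken { PBTokenType type; std::string value; };       // assumed shape

static const std::vector<std::pair<PBTokenType, std::regex>> TOKEN_TYPES = {
    { PREPROC,    std::regex(R"(#[a-zA-Z]+\b)",   std::regex_constants::icase) },
    { FUNCTION,   std::regex(R"(\bfunction\b)",   std::regex_constants::icase) },
    { SUB,        std::regex(R"(\bsub\b)",        std::regex_constants::icase) },
    { END,        std::regex(R"(\bend\b)",        std::regex_constants::icase) },
    { IDENTIFIER, std::regex(R"(\b[a-zA-Z]+\b)") },  // must come after the keywords
};

// Try each pattern at the front of the remaining code; on a hit,
// consume the lexeme and return it, like the Ruby tokenize_one_token.
PBToken tokenize_one_token(std::string& code) {
    for (const auto& [type, re] : TOKEN_TYPES) {
        std::smatch m;
        if (std::regex_search(code, m, re, std::regex_constants::match_continuous)) {
            PBToken token{ type, m.str(0) };
            code.erase(0, m.length(0));
            return token;
        }
    }
    throw std::runtime_error("no token matches at: " + code);
}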

View File

@@ -0,0 +1,19 @@
+#include "tokenizer.hpp"
+
+Tokenizer::Tokenizer(std::string code) {
+    this->code = code;
+}
+
+std::vector<PBToken> Tokenizer::tokenize() {
+    std::vector<PBToken> tokens;
+    tokens.push_back(tokenize_one_token());
+    return tokens;
+}
+
+PBToken Tokenizer::tokenize_one_token() {
+    return { .type = FUNCTION, .value = "Function" };
+}
+
+std::string Tokenizer::dump() {
+    return this->code;
+}
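
tokenize_one_token here is still a stub that returns one hard-coded FUNCTION token, so tokenize emits exactly one token and never consumes the input. Once tokenize_one_token is reworked to strip what it matches from the code member (set in the constructor above), the driving loop from the Ruby tokenizer carries over directly; a sketch under that assumption:

#include "tokenizer.hpp"

std::vector<PBToken> Tokenizer::tokenize() {
    std::vector<PBToken> tokens;
    // Mirror the Ruby "until @code.empty?" loop: take one token,
    // then strip leading whitespace before the next attempt.
    while (!code.empty()) {
        tokens.push_back(tokenize_one_token());
        size_t start = code.find_first_not_of(" \t\r\n");
        code = (start == std::string::npos) ? "" : code.substr(start);
    }
    return tokens;
}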

View File

@ -34,5 +34,6 @@ public:
Tokenizer(std::string); Tokenizer(std::string);
std::vector<PBToken> tokenize(); std::vector<PBToken> tokenize();
PBToken tokenize_one_token(); PBToken tokenize_one_token();
std::string dump();
}; };
#endif #endif