split out tokenizer class

This commit is contained in:
hiro 2025-11-18 16:24:59 -06:00
parent 68ea89ece6
commit d0714f8664
3 changed files with 41 additions and 22 deletions

View File

@ -4,27 +4,7 @@
#include <unordered_map> #include <unordered_map>
#include <vector> #include <vector>
typedef enum { #include "tokenizer.hpp"
FUNCTION,
SUB,
END,
AS,
TYPE,
IDENTIFIER,
INTEGER,
STRING,
OPAREN,
CPAREN,
COMMA,
QUOTE,
EQUALS,
TOKEN_TYPE_COUNT
} PBTokenType;
typedef struct {
PBTokenType type;
std::string value;
} PBToken;
std::vector<PBToken> tokenize(std::string code); std::vector<PBToken> tokenize(std::string code);
@ -38,6 +18,7 @@ int main(int argc, char* argv[]) {
std::vector<PBToken> tokenize(std::string code) { std::vector<PBToken> tokenize(std::string code) {
const PBToken tokenize_one = [](std::string fragment) { const PBToken tokenize_one = [](std::string fragment) {
//const std::unordered_map<PBTokenType, std::regex> tokentypes = std::unordered_map();
const std::regex re_func("\bfunction\b", std::regex_constants::icase); const std::regex re_func("\bfunction\b", std::regex_constants::icase);
const std::regex re_sub( "\bsub\b", std::regex_constants::icase); const std::regex re_sub( "\bsub\b", std::regex_constants::icase);
const std::regex re_end( "\bend\b", std::regex_constants::icase); const std::regex re_end( "\bend\b", std::regex_constants::icase);
@ -63,7 +44,7 @@ std::vector<PBToken> tokenize(std::string code) {
while(code.length() > 0) { while(code.length() > 0) {
int split = code.find(' '); int split = code.find(' ');
std::string fragment = split > 0 ? code.substr(0, split) : code; std::string fragment = split > 0 ? code.substr(0, split) : code;
tokens.push_back(tokenize_one(fragment)); tokens.push_back(fragment);
} }
return tokens; return tokens;
} }

0
tokenizer.cpp Normal file
View File

38
tokenizer.hpp Normal file
View File

@ -0,0 +1,38 @@
#pragma once
#ifndef TOKENIZER_HPP
#define TOKENIZER_HPP
#include <regex>
#include <string>
#include <unordered_map>
#include <vector>
/// Categories of token the tokenizer can produce.
///
/// Enumerators are numbered consecutively from zero, so TOKEN_TYPE_COUNT
/// (always kept last) equals the number of real token categories.
enum PBTokenType {
    // Word-like tokens (presumably language keywords -- confirm against the
    // regex table in the tokenizer implementation).
    FUNCTION = 0,
    SUB,
    END,
    AS,
    TYPE,

    // Names and literal values.
    IDENTIFIER,
    INTEGER,
    STRING,

    // Punctuation.
    OPAREN,
    CPAREN,
    COMMA,
    QUOTE,
    EQUALS,

    // Sentinel: not a token category; must remain the final enumerator.
    TOKEN_TYPE_COUNT
};
typedef struct {
PBTokenType type;
std::string value;
} PBToken;
/// Tokenizer interface: constructed from a string, yields PBToken values.
///
/// Only the declarations live here; the member definitions are expected in
/// the accompanying implementation file.
class Tokenizer {
public:
    /// Builds a tokenizer from a string taken by value.
    /// NOTE(review): presumably this is the source text kept in `code` -- confirm.
    Tokenizer(std::string source);

    /// Returns a sequence of tokens.
    std::vector<PBToken> tokenize();

    /// Returns a single token.
    PBToken tokenize_one_token();

private:
    // Regular expression associated with each token category.
    std::unordered_map<PBTokenType, std::regex> tokentypes;

    // String state owned by the tokenizer.
    std::string code;
};
#endif