wip: implement tokenizer in c++, ignore binaries
This commit is contained in:
parent
fb1059683f
commit
1e7ea750f6
2
.gitignore
vendored
2
.gitignore
vendored
@ -1,2 +1,4 @@
|
||||
a.out
|
||||
npb
|
||||
hello
|
||||
|
||||
|
||||
41
neopb.cpp
41
neopb.cpp
@ -1,8 +1,6 @@
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <regex>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
#include "tokenizer.hpp"
|
||||
@ -16,17 +14,24 @@ std::string readFileContents(std::string fname) {
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
#ifdef DEBUG
|
||||
for (int n = 0; n < argc; n++) {
|
||||
std::cout << "arg" << n << ": " << argv[n] << std::endl;
|
||||
}
|
||||
#endif
|
||||
std::string infile = argc > 1 ? argv[1] : "";
|
||||
std::string code = "";
|
||||
if(infile.length() > 0) {
|
||||
code = readFileContents(infile);
|
||||
}
|
||||
Tokenizer tokenizer = Tokenizer(code);
|
||||
|
||||
#ifdef DEBUG
|
||||
std::cout << "code: " << tokenizer.dump() << std::endl;
|
||||
#endif
|
||||
|
||||
std::vector<PBToken> tokens = tokenizer.tokenize();
|
||||
|
||||
for(int i = 0; i < tokens.size(); i++) {
|
||||
std::cout << tokens[i].value << std::endl;
|
||||
}
|
||||
@ -34,35 +39,3 @@ int main(int argc, char* argv[]) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
//std::vector<PBToken> tokenize(std::string code) {
|
||||
// const PBToken tokenize_one = [](std::string fragment) {
|
||||
// //const std::unordered_map<PBTokenType, std::regex> tokentypes = std::unordered_map();
|
||||
// const std::regex re_func("\bfunction\b", std::regex_constants::icase);
|
||||
// const std::regex re_sub( "\bsub\b", std::regex_constants::icase);
|
||||
// const std::regex re_end( "\bend\b", std::regex_constants::icase);
|
||||
// const std::regex re_as("\bas\b", std::regex_constants::icase);
|
||||
// const std::regex re_type("\blong\b", std::regex_constants::icase);
|
||||
// const std::regex re_identifier("\b[a-zA-Z]+\b");
|
||||
// const std::regex re_integer("\b[0-9]+\b");
|
||||
// const std::regex re_string("\".*\"");
|
||||
// const std::regex re_oparen("\(");
|
||||
// const std::regex re_cparen("\)");
|
||||
// const std::regex re_comma(",");
|
||||
// const std::regex re_quote("'");
|
||||
// const std::regex re_equals("=");
|
||||
//
|
||||
// PBTokenType tt = SUB;
|
||||
// std::string val = fragment.trim();
|
||||
//
|
||||
//
|
||||
//
|
||||
// return { .type = tt, .value = val };
|
||||
// };
|
||||
// std::vector<PBToken> tokens();
|
||||
// while(code.length() > 0) {
|
||||
// int split = code.find(' ');
|
||||
// std::string fragment = split > 0 ? code.substr(0, split) : code;
|
||||
// tokens.push_back(fragment);
|
||||
// }
|
||||
// return tokens;
|
||||
//}
|
||||
|
||||
@ -1,17 +1,61 @@
|
||||
#include "tokenizer.hpp"
|
||||
|
||||
#include <iostream>
|
||||
|
||||
// Strip leading whitespace from `s` in place and return the trimmed string.
// Fixes two defects: the function name was garbled to "<rim" (HTML-escape
// damage of "ltrim" — both call sites in tokenize() spell it ltrim), and
// std::isspace must receive an unsigned char value: passing a plain (possibly
// negative) char is undefined behavior.
static inline std::string ltrim(std::string &s) {
    s.erase(s.begin(), std::find_if(s.begin(), s.end(),
            [](unsigned char c) { return !std::isspace(c); }));
    return s;
}
|
||||
|
||||
// Construct a tokenizer over `code`, registering (name, pattern) pairs in
// priority order. Every pattern is anchored with ^ so matching always happens
// at the head of the remaining input. Keyword patterns precede the generic
// IDENTIFIER so they win ties, and PBTokenType values index this table, so
// the order here must not change.
Tokenizer::Tokenizer(std::string code) {
    using std::regex;
    const auto icase = std::regex_constants::icase;
    tokentypes = {
        { "PREPROC",    regex("^(#[a-zA-Z]+\\b)") },
        { "FUNCTION",   regex("^(\\bfunction\\b)", icase) },
        { "SUB",        regex("^(\\bsub\\b)", icase) },
        { "END",        regex("^(\\bend\\b)", icase) },
        { "AS",         regex("^(\\bas\\b)", icase) },
        { "TYPE",       regex("^(\\blong\\b)", icase) },
        { "IDENTIFIER", regex("^(\\b[a-zA-Z]+\\b)") },
        { "INTEGER",    regex("^(\\b[0-9]+\\b)") },
        { "STRING",     regex("^(\".*\")") },
        { "OPAREN",     regex("^(\\()") },
        { "CPAREN",     regex("^(\\))") },
        { "COMMA",      regex("^(,)") },
        { "QUOTE",      regex("^(')") },
        { "EQUALS",     regex("^(=)") },
    };
    this->code = code;
}
|
||||
|
||||
std::vector<PBToken> Tokenizer::tokenize() {
|
||||
std::vector<PBToken> tokens;
|
||||
code = ltrim(code);
|
||||
while(code.size() > 0) {
|
||||
tokens.push_back(tokenize_one_token());
|
||||
code = ltrim(code);
|
||||
}
|
||||
return tokens;
|
||||
}
|
||||
|
||||
// Try each token pattern in table order against the head of `code`; on the
// first hit, record the token type and lexeme, consume the lexeme from
// `code`, and return the token. Throws std::runtime_error when nothing
// matches (reporting the offending word up to the next whitespace).
//
// Fix: removed the leftover debug stub `return { .type = FUNCTION, ... }`
// that sat at the top of the function — it made every line below dead code
// and, because it never consumed any input, sent tokenize() into an infinite
// loop on any non-empty source.
PBToken Tokenizer::tokenize_one_token() {
    PBToken tk;
    for (int tt = 0; tt < TOKEN_TYPE_COUNT; tt++) {
#ifdef DEBUG
        std::cout << "attempting to match " << std::get<0>(tokentypes[tt]) << std::endl;
#endif
        std::regex re = std::get<1>(tokentypes[tt]);
        std::smatch sm;
        if (std::regex_search(code, sm, re)) {
            tk.type = (PBTokenType) tt;
            tk.value = sm[0];
#ifdef DEBUG
            std::cout << "match found" << std::endl;
            std::cout << "tk.type: " << std::get<0>(tokentypes[tk.type]) << std::endl << "tk.value: " << tk.value << std::endl;
#endif
            // Drop the matched lexeme so the next call starts after it.
            code = code.substr(tk.value.length());
            return tk;
        }
    }
    throw std::runtime_error("Couldn't match token: " + code.substr(0, code.find_first_of(" \t\r\n")));
}
|
||||
|
||||
std::string Tokenizer::dump() {
|
||||
|
||||
@ -4,8 +4,11 @@
|
||||
|
||||
#include <regex>
|
||||
#include <unordered_map>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
typedef enum {
|
||||
PREPROC,
|
||||
FUNCTION,
|
||||
SUB,
|
||||
END,
|
||||
@ -28,7 +31,9 @@ typedef struct {
|
||||
} PBToken;
|
||||
|
||||
class Tokenizer {
|
||||
std::unordered_map<PBTokenType, std::regex> tokentypes;
|
||||
//std::unordered_map<PBTokenType, std::regex> tokentypes;
|
||||
std::vector<std::pair<std::string, std::regex>> tokentypes;
|
||||
//std::vector<std::regex> tokentypes;
|
||||
std::string code;
|
||||
public:
|
||||
Tokenizer(std::string);
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user