wip: work on cpp tokenizer implementation
parent d0714f8664
commit fb1059683f

neopb.cpp
@@ -1,3 +1,4 @@
#include <fstream>
#include <iostream>
#include <regex>
#include <string>
@@ -8,43 +9,60 @@

std::vector<PBToken> tokenize(std::string code);

// slurp the entire file into a string via istreambuf iterators
std::string readFileContents(std::string fname) {
    std::ifstream ifs(fname);
    std::string contents((std::istreambuf_iterator<char>(ifs)), (std::istreambuf_iterator<char>()));
    return contents;
}

int main(int argc, char* argv[]) {
    //for (int n = 0; n < argc; n++) {
    //    std::cout << "arg" << n << ": " << argv[n] << std::endl;
    //}
    for (int n = 0; n < argc; n++) {
        std::cout << "arg" << n << ": " << argv[n] << std::endl;
    }
    std::string infile = argc > 1 ? argv[1] : "";
    std::string code = "";
    if (infile.length() > 0) {
        code = readFileContents(infile);
    }
    Tokenizer tokenizer(code); // direct init; `Tokenizer tokenizer = Tokenizer(code)` is redundant
    std::cout << "code: " << tokenizer.dump() << std::endl;
    std::vector<PBToken> tokens = tokenizer.tokenize();
    for (const PBToken& token : tokens) {
        std::cout << token.value << std::endl;
    }

    return 0;
}

std::vector<PBToken> tokenize(std::string code) {
    // classify one whitespace-delimited fragment as a token
    const auto tokenize_one = [](std::string fragment) -> PBToken {
        //const std::unordered_map<PBTokenType, std::regex> tokentypes = std::unordered_map();
        // raw string literals keep \b as a regex word boundary; in an ordinary
        // literal "\b" is a backspace character and the patterns can never match
        const std::regex re_func(R"(\bfunction\b)", std::regex_constants::icase);
        const std::regex re_sub(R"(\bsub\b)", std::regex_constants::icase);
        const std::regex re_end(R"(\bend\b)", std::regex_constants::icase);
        const std::regex re_as(R"(\bas\b)", std::regex_constants::icase);
        const std::regex re_type(R"(\blong\b)", std::regex_constants::icase);
        const std::regex re_identifier(R"(\b[a-zA-Z]+\b)");
        const std::regex re_integer(R"(\b[0-9]+\b)");
        const std::regex re_string(R"(".*")");
        const std::regex re_oparen(R"(\()");
        const std::regex re_cparen(R"(\))");
        const std::regex re_comma(",");
        const std::regex re_quote("'");
        const std::regex re_equals("=");

        PBTokenType tt = SUB; // placeholder: matching against the patterns above is still to do
        // std::string has no trim(); strip surrounding whitespace by hand
        const auto first = fragment.find_first_not_of(" \t\r\n");
        const auto last = fragment.find_last_not_of(" \t\r\n");
        std::string val = first == std::string::npos ? "" : fragment.substr(first, last - first + 1);

        return { .type = tt, .value = val };
    };
    std::vector<PBToken> tokens; // `tokens()` would declare a function (most vexing parse)
    while (code.length() > 0) {
        const std::string::size_type split = code.find(' ');
        std::string fragment = split != std::string::npos ? code.substr(0, split) : code;
        tokens.push_back(tokenize_one(fragment)); // wrap the fragment in a PBToken
        // consume the fragment and its separator so the loop terminates
        code.erase(0, split != std::string::npos ? split + 1 : code.length());
    }
    return tokens;
}
//std::vector<PBToken> tokenize(std::string code) {
//    const PBToken tokenize_one = [](std::string fragment) {
//        //const std::unordered_map<PBTokenType, std::regex> tokentypes = std::unordered_map();
//        const std::regex re_func("\bfunction\b", std::regex_constants::icase);
//        const std::regex re_sub( "\bsub\b", std::regex_constants::icase);
//        const std::regex re_end( "\bend\b", std::regex_constants::icase);
//        const std::regex re_as("\bas\b", std::regex_constants::icase);
//        const std::regex re_type("\blong\b", std::regex_constants::icase);
//        const std::regex re_identifier("\b[a-zA-Z]+\b");
//        const std::regex re_integer("\b[0-9]+\b");
//        const std::regex re_string("\".*\"");
//        const std::regex re_oparen("\(");
//        const std::regex re_cparen("\)");
//        const std::regex re_comma(",");
//        const std::regex re_quote("'");
//        const std::regex re_equals("=");
//
//        PBTokenType tt = SUB;
//        std::string val = fragment.trim();
//
//
//
//        return { .type = tt, .value = val };
//    };
//    std::vector<PBToken> tokens();
//    while(code.length() > 0) {
//        int split = code.find(' ');
//        std::string fragment = split > 0 ? code.substr(0, split) : code;
//        tokens.push_back(fragment);
//    }
//    return tokens;
//}

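The active tokenize above still labels every fragment SUB; matching against the regex table is the missing piece. A minimal sketch of how a fragment could be classified, in the spirit of the Ruby TOKEN_TYPES list in pb2c.rb below — the classify helper and the END enumerator are assumptions, not part of this commit:

#include <regex>
#include <string>
#include <utility>
#include <vector>
#include "tokenizer.hpp" // assumed to define PBToken and PBTokenType

// Hypothetical helper: the first pattern that matches the whole fragment wins.
PBToken classify(const std::string& val) {
    static const std::vector<std::pair<PBTokenType, std::regex>> patterns = {
        { FUNCTION, std::regex(R"(\bfunction\b)", std::regex_constants::icase) },
        { SUB,      std::regex(R"(\bsub\b)",      std::regex_constants::icase) },
        { END,      std::regex(R"(\bend\b)",      std::regex_constants::icase) }, // END is a guess
    };
    for (const auto& [type, re] : patterns) {
        if (std::regex_match(val, re)) {
            return { .type = type, .value = val };
        }
    }
    return { .type = SUB, .value = val }; // fallback mirrors the current stub
}
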
8
pb2c.rb
8
pb2c.rb
@ -2,6 +2,7 @@
|
||||
|
||||
class Tokenizer
  TOKEN_TYPES = [
    [:preproc, /#[a-zA-Z]+\b/i],
    [:function, /\bfunction\b/i],
    [:sub, /\bsub\b/i],
    [:end, /\bend\b/i],
@@ -25,6 +26,7 @@ class Tokenizer
    until @code.empty?
      tokens << tokenize_one_token
      @code = @code.strip
      #puts tokens.join("\n")
    end
  rescue RuntimeError => e
    puts tokens.join("\n")
@@ -58,6 +60,9 @@ class Parser
    parse_function
  end

  def parse_preproc
  end

  def parse_function
    consume(:function)
    name = consume(:identifier).value
@@ -146,7 +151,8 @@ class Generator
  end
end

tokens = Tokenizer.new(File.read("hello.bas")).tokenize
#tokens = Tokenizer.new(File.read("hello.bas")).tokenize
tokens = Tokenizer.new(File.read(ARGV[0])).tokenize
#puts "Tokens:\n"
#puts tokens.join("\n")
tree = Parser.new(tokens).parse

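The Ruby tokenize_one_token is not shown in these hunks, but the until @code.empty? loop and the rescue RuntimeError handler suggest it tries each TOKEN_TYPES regex at the front of @code and raises when nothing matches. A C++ counterpart of that consume-from-the-front step could look like the sketch below; consume_front and its patterns parameter are assumptions:

#include <regex>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>
#include "tokenizer.hpp" // assumed to define PBToken and PBTokenType

// Hypothetical: match one token at the very start of code, erase it, return it.
PBToken consume_front(std::string& code,
                      const std::vector<std::pair<PBTokenType, std::regex>>& patterns) {
    std::smatch m;
    for (const auto& [type, re] : patterns) {
        // match_continuous pins the match to position 0, like Ruby's \A anchor
        if (std::regex_search(code, m, re, std::regex_constants::match_continuous)) {
            PBToken tok{ .type = type, .value = m.str() };
            code.erase(0, m.length());
            return tok;
        }
    }
    throw std::runtime_error("couldn't match token on: " + code);
}
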
tokenizer.cpp
@@ -0,0 +1,19 @@
#include "tokenizer.hpp"

Tokenizer::Tokenizer(std::string code) {
    this->code = code;
}

std::vector<PBToken> Tokenizer::tokenize() {
    std::vector<PBToken> tokens;
    tokens.push_back(tokenize_one_token());
    return tokens;
}

PBToken Tokenizer::tokenize_one_token() {
    // stub: a single hard-coded token for now
    return { .type = FUNCTION, .value = "Function" };
}

std::string Tokenizer::dump() {
    return this->code;
}
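Tokenizer::tokenize currently emits that single hard-coded token. Once tokenize_one_token consumes what it matches from this->code, the loop shape of the Ruby version (until @code.empty? followed by @code = @code.strip) carries over almost verbatim; a sketch under that assumption:

// Sketch only: assumes tokenize_one_token() removes the matched text from
// this->code; with the current stub this loop would never terminate.
std::vector<PBToken> Tokenizer::tokenize() {
    std::vector<PBToken> tokens;
    while (!code.empty()) {
        tokens.push_back(tokenize_one_token());
        // drop whitespace between tokens, like @code.strip in the Ruby version
        const auto first = code.find_first_not_of(" \t\r\n");
        code.erase(0, first == std::string::npos ? code.size() : first);
    }
    return tokens;
}
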
tokenizer.hpp
@@ -34,5 +34,6 @@ public:
    Tokenizer(std::string);
    std::vector<PBToken> tokenize();
    PBToken tokenize_one_token();
    std::string dump();
};
#endif

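Only the tail of tokenizer.hpp appears in this diff. For orientation, the rest of the header plausibly looks something like the sketch below: FUNCTION, SUB, .type, .value, and this->code are confirmed by usage elsewhere in the commit, while the remaining enumerators and the include guard name are guesses:

#ifndef TOKENIZER_HPP // guard name is a guess
#define TOKENIZER_HPP

#include <string>
#include <vector>

// FUNCTION and SUB appear in the commit; the other enumerators are guesses
// based on the regex names (re_end, re_as, re_type, re_identifier, ...).
enum PBTokenType { FUNCTION, SUB, END, AS, TYPE, IDENTIFIER, INTEGER, STRING };

struct PBToken {
    PBTokenType type;  // used as .type throughout the commit
    std::string value; // used as .value throughout the commit
};

class Tokenizer {
    std::string code; // remaining source text, set by the constructor
public:
    Tokenizer(std::string);
    std::vector<PBToken> tokenize();
    PBToken tokenize_one_token();
    std::string dump();
};

#endif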