Skip to main content

Compiler

Lexer

The Lexer takes a string as input, removes irrelevant information such as comments and whitespace, and outputs a list of tokens that can then be further processed by a Parser.

1) Define Tokens

| Type   | Example     | Regex                |
| ------ | ----------- | -------------------- |
| ID     | `temp data` | [a-zA-Z][a-zA-Z0-9]* |
| NUM    | `13 43`     | [0-9]+               |
| COMMA  | `,`         | ,                    |
| LPAREN | `(`         | \(                   |

Also define skip regexes: patterns for input (whitespace, comments) that the lexer discards instead of turning into tokens.

| Type       | Example         | Regex              |
| ---------- | --------------- | ------------------ |
| Comment    | `# bla blubb`   | '#' [a-z]* '\n'    |
| Whitespace | `' ' '\t' '\n'` | (' '\|'\t'\|'\n')+ |

2) Type Definition

lexer.h
// Token categories produced by the lexer.
enum class TokenType {
  kFloat,
  kLParen,
  kRParen,
  kIf,
  /* .. */
  // Sentinel returned by consume_token_ when no definition matches the
  // input; placed last so the existing enumerator values are unchanged.
  kUnknown,
};

// Pairs a token type with the regex pattern (ECMAScript syntax, kept as
// a string and compiled on demand) that recognizes it.
struct TokenDefinition {
TokenType type;  // which token this definition produces
std::string regex;  // pattern; the lexer stores capture group 1 as the value
};

// A single lexed token: its category plus the matched source text.
struct Token {
TokenType type;
std::string value;  // the lexeme captured from the input
};

3) Define Token Definitions and Skip Regex

main.cpp
#include <regex>

#include "lexer.h"

// Token definitions, tried in order — first match wins, so list specific
// patterns (keywords) before general ones (identifiers).  Each pattern
// wraps the lexeme in a capture group; the lexer stores match[1] as the
// token's value.
// NOTE(review): TokenDefinition is qualified with lexer:: but TokenType
// is not — confirm whether lexer.h declares a namespace and qualify both
// consistently.
const std::vector<lexer::TokenDefinition> token_definitions{
{TokenType::kFloat, "(float)"},
{TokenType::kLParen, "(\\()"},
{TokenType::kRParen, "(\\))"},
{TokenType::kIf, "(if)"},
/* .. */
};

// Input to discard between tokens: spaces, newlines, carriage returns,
// and C-style /* ... */ block comments.  Anchored with '^' so a skip
// match can only occur at the current input position — an unanchored
// search would let the lexer silently drop real input that precedes a
// comment appearing later in the text.
const std::regex skip_regex("^( |\n|\r|\\/\\*(\\*(?!\\/)|[^*])*\\*\\/)", std::regex::ECMAScript);

4) Implement Lexer

lexer.cpp
// Splits `text` into a list of tokens.
//
// Repeatedly strips skippable input (matched by `skip_regex`) and then
// consumes one token at a time until the input is exhausted.
//
// Throws std::runtime_error when a stretch of input matches neither the
// skip regex nor any token definition.
std::list<Token> lex(
    std::string text,
    const std::vector<lexer::TokenDefinition>& token_definitions,
    const std::regex& skip_regex)
{
  std::list<Token> tokens;
  while (!text.empty()) {
    std::smatch match;
    // match_continuous anchors the search at the first character, so a
    // skip match cannot silently swallow unmatched input preceding it.
    if (std::regex_search(text, match, skip_regex,
                          std::regex_constants::match_continuous)) {
      text = match.suffix();
      continue;
    }
    // consume_token_ takes the definitions to try; forward them through.
    auto [token, text_after_token_consumed] =
        consume_token_(text, token_definitions);
    if (token.type == TokenType::kUnknown) {
      std::stringstream ss;
      ss << "Got input that did not match any definition: " << text;
      throw std::runtime_error(ss.str());
    }
    tokens.emplace_back(token);
    text = std::move(text_after_token_consumed);
  }
  return tokens;
}

// Tries each token definition, in order, against the start of `text`.
//
// Returns the first matching token together with the remaining (not yet
// consumed) input.  When nothing matches, returns a kUnknown token and
// an empty remainder so the caller can report the error.
//
// Each definition's regex is expected to wrap the lexeme in one capture
// group: match[1] becomes the token's value.
std::tuple<Token, std::string> consume_token_(
    const std::string& text,
    const std::vector<lexer::TokenDefinition>& token_definitions)
{
  for (const auto& token_definition : token_definitions) {
    const std::regex regex(token_definition.regex, std::regex::ECMAScript);
    std::smatch match;
    // match_continuous forces the match to begin at the first character.
    // Without it a definition could match in the middle of the input and
    // match.suffix() would silently drop the text before the match.
    if (std::regex_search(text, match, regex,
                          std::regex_constants::match_continuous)) {
      Token token{token_definition.type, match[1]};
      return std::make_tuple(token, match.suffix());
    }
  }
  return std::make_tuple(Token{TokenType::kUnknown, ""}, std::string{});
}

5) Run Lexer

main.cpp
const std::string text = 
"float match0(char *s)\n"
"{if (!strcmp(s, "0.0", 3))\n"
" return 0.0;\n"
"};";
auto tokens = lexer::lex(text, token_definition, skip_regex);

Parser

Recursive-Descent Parser

Use a recursive-descent parser when, for each nonterminal, every alternative rule starts with a distinct terminal symbol — then a single token of lookahead is enough to select the rule.

S -> if E then S else S
S -> begin S L
S -> print E
L -> end
L -> ; S L
E -> num = num

Parser code:

// Integer codes for the grammar's terminal symbols (token types).
final int IF=1, THEN=2, ELSE=3, BEGIN=4, END=5, PRINT=6, SEMI=7, NUM=8, EQ=9;

// One-token lookahead: `tok` always holds the next unconsumed token.
int tok = getToken();
// Fetch the next token from the lexer into the lookahead.
void advance() {tok = getToken();}
// Consume token `t` if it is the lookahead; otherwise report a syntax error.
void eat(int t) {if (tok==t) advance(); else error();}

// Parses S -> if E then S else S | begin S L | print E.
// Every case ends in `break`; without it the BEGIN and PRINT cases fall
// through into the next case and finally into error().
void S() {
  switch(tok) {
    case IF: eat(IF); E(); eat(THEN); S(); eat(ELSE); S(); break;
    case BEGIN: eat(BEGIN); S(); L(); break;
    case PRINT: eat(PRINT); E(); break;
    default : error();
  }
}
// Parses L -> end | ; S L.
// A lookahead that starts neither alternative is a syntax error; without
// the default case such input would be silently accepted.
void L() {
  switch(tok) {
    case END: eat(END); break;
    case SEMI: eat(SEMI); S(); L(); break;
    default: error();
  }
}
// Parses E -> num = num.
void E() {
  eat(NUM);
  eat(EQ);
  eat(NUM);
}

Reference