Compiler
Lexer
The lexer takes a string as input, removes irrelevant information such as comments and whitespace, and outputs a list of tokens that can then be processed further by a parser.
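For example, with the token definitions from step 1 below, the input `temp, 13` would be turned into the token stream `ID COMMA NUM`, with the values `temp` and `13` attached to the ID and NUM tokens.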
1) Define Tokens
| Type | Example | Regex |
|---|---|---|
| ID | temp data | [a-zA-Z][a-zA-Z0-9]* |
| NUM | 13 43 | [0-9]+ |
| COMMA | , | , |
| LPAREN | ( | \( |
2) Define Skip Regexes
| Type | Example | Regex |
|---|---|---|
| Comment | # bla blubb | '#' [^\n]* '\n' |
| Whitespace | ' ' '\t' '\n' | (' '|'\t'|'\n')+ |
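Anything a skip regex matches is consumed without producing a token. A quick sketch (illustrative, not part of the original) of the comment rule as a sticky JavaScript regex:

```ts
// Sticky ("y") regexes only match at lastIndex, which is how a lexer
// anchors matching at its current position in the input.
const skip = /#[^\n]*\n|[ \t\n]+/y;
skip.lastIndex = 0;
console.log(skip.test("# bla blubb\ntemp"));  // true: the comment matched
console.log(skip.lastIndex);                  // 12: lexing resumes at "temp"
```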
3) Type Definitions
lexer.h
```cpp
#pragma once

#include <string>

namespace lexer {

enum class TokenType {
  kUnknown,  // sentinel returned when no definition matches
  kFloat,
  kLParen,
  kRParen,
  kIf,
  /* .. */
};

struct TokenDefinition {
  TokenType type;
  std::string regex;
};

struct Token {
  TokenType type;
  std::string value;
};

}  // namespace lexer
```
lexer.ts
```ts
export type TokenType =
  "FLOAT" |
  "LPAREN" |
  "RPAREN" |
  "IF" /* .. */;

export interface TokenDefinition<T> {
  type: T;
  regex: string;
}

export interface Token<T> {
  type: T;
  value: string;
}
```
4) Define Token Definitions
main.cpp
```cpp
#include <regex>
#include <vector>

#include "lexer.h"

using lexer::TokenType;

const std::vector<lexer::TokenDefinition> token_definitions{
    {TokenType::kFloat, "(float)"},
    {TokenType::kLParen, "(\\()"},
    {TokenType::kRParen, "(\\))"},
    {TokenType::kIf, "(if)"},
    /* .. */
};

// Matches a single whitespace character or a C-style block comment.
const std::regex skip_regex(
    " |\n|\r|\\/\\*(\\*(?!\\/)|[^*])*\\*\\/", std::regex::ECMAScript);
```
main.ts
```ts
import * as lexer from './lexer';

const tokenDefinitions: lexer.TokenDefinition<lexer.TokenType>[] = [
  {type: "FLOAT", regex: "float"},
  {type: "LPAREN", regex: "\\("},
  {type: "RPAREN", regex: "\\)"},
  {type: "IF", regex: "if"},
  /* .. */
];

// Matches a single whitespace character or a C-style block comment.
const skip_regex = " |\n|\r|\\/\\*(\\*(?!\\/)|[^*])*\\*\\/";
```
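The last alternative of `skip_regex` matches C-style block comments; the negative lookahead `(?!\/)` lets it step over `*` characters inside the comment without ending it early. A quick check (illustrative, not part of the original):

```ts
const re = new RegExp(skip_regex, "y");     // "y" anchors the match at lastIndex
console.log(re.test("/* comment */rest"));  // true: the whole comment matched
console.log(re.lastIndex);                  // 13: lexing would resume at "rest"
```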
5) Implement Lexer
lexer.cpp
```cpp
#include <list>
#include <regex>
#include <sstream>
#include <stdexcept>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

#include "lexer.h"

namespace lexer {

// Tries every token definition at the start of `text` and returns the
// matched token together with the remaining text after it.
std::tuple<Token, std::string> consume_token_(
    const std::string& text,
    const std::vector<TokenDefinition>& token_definitions)
{
  for (const auto& token_definition : token_definitions) {
    std::smatch match;
    std::regex regex(token_definition.regex, std::regex::ECMAScript);
    // match_continuous anchors the match at the beginning of `text`.
    if (std::regex_search(text, match, regex,
                          std::regex_constants::match_continuous)) {
      Token token{token_definition.type, match[1]};
      return std::make_tuple(token, match.suffix().str());
    }
  }
  return std::make_tuple(Token{TokenType::kUnknown, ""}, "");
}

std::list<Token> lex(
    std::string text,
    const std::vector<TokenDefinition>& token_definitions,
    const std::regex& skip_regex)
{
  std::list<Token> tokens;
  while (!text.empty()) {
    std::smatch match;
    // First try to skip comments and whitespace at the current position.
    if (std::regex_search(text, match, skip_regex,
                          std::regex_constants::match_continuous)) {
      text = match.suffix().str();
      continue;
    }
    auto [token, text_after_token_consumed] =
        consume_token_(text, token_definitions);
    if (token.type == TokenType::kUnknown) {
      std::stringstream ss;
      ss << "Got input that did not match any definition: " << text;
      throw std::runtime_error(ss.str());
    }
    tokens.emplace_back(token);
    text = std::move(text_after_token_consumed);
  }
  return tokens;
}

}  // namespace lexer
```
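Note the `std::regex_constants::match_continuous` flag: without it, `std::regex_search` finds a match anywhere in the string, so a skip or token match in the middle of the input would silently drop everything before it. The TypeScript version below gets the same anchoring from the sticky flag (`y`) together with `lastIndex`.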
lexer.ts
```ts
export const lex = <T>(
  text: string,
  tokenDefinitions: TokenDefinition<T>[],
  nontokenDefinition: string): Token<T>[] => {
  const tokens: Token<T>[] = [];
  const skipRegex = new RegExp(nontokenDefinition, "y");
  let currentSearchIndex = 0;
  while (currentSearchIndex < text.length) {
    // First try to skip comments and whitespace at the current position.
    skipRegex.lastIndex = currentSearchIndex;
    if (skipRegex.test(text)) {
      currentSearchIndex = skipRegex.lastIndex;
    } else {
      const { token, lastIndex } = findToken<T>(text, tokenDefinitions, currentSearchIndex);
      if (token && lastIndex !== undefined) {
        tokens.push(token);
        currentSearchIndex = lastIndex;
      } else {
        const prettyText = removeLineBreaksAndTabs(text.slice(currentSearchIndex));
        throw Error(`Got input that did not match any definition: ${prettyText}`);
      }
    }
  }
  return tokens;
};

// Tries every token definition at `lastIndex`; the sticky flag ("y")
// anchors the match at exactly that position.
const findToken = <T>(
  text: string,
  tokenDefinitions: TokenDefinition<T>[],
  lastIndex: number): TokenSearchResult<T> => {
  for (const token of tokenDefinitions) {
    const regExp = new RegExp(token.regex, "y");
    regExp.lastIndex = lastIndex;
    const match = regExp.exec(text);
    if (match) {
      return {
        token: { type: token.type, value: match[0] },
        lastIndex: regExp.lastIndex
      };
    }
  }
  return {};
};

const removeLineBreaksAndTabs = (text: string) =>
  text.replace(/\n/g, "\\n").replace(/\t/g, "\\t");

interface TokenSearchResult<T> {
  token?: Token<T>;
  lastIndex?: number;
}
```
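A minimal usage sketch (not part of the original) with two token definitions and a whitespace-only skip regex:

```ts
import { lex } from './lexer';

const tokens = lex(
  "if ((",
  [{ type: "IF", regex: "if" }, { type: "LPAREN", regex: "\\(" }],
  " +");
console.log(tokens);
// [ { type: 'IF', value: 'if' },
//   { type: 'LPAREN', value: '(' },
//   { type: 'LPAREN', value: '(' } ]
```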
6) Run Lexer
main.cpp
```cpp
const std::string text =
    "float match0(char *s)\n"
    "{if (!strcmp(s, \"0.0\", 3))\n"
    "  return 0.0;\n"
    "};";
auto tokens = lexer::lex(text, token_definitions, skip_regex);
```
main.ts
```ts
const text = `
float match0(char *s)
{if (!strcmp(s, "0.0", 3))
  return 0.0;
}`;
const tokens = lexer.lex(text, tokenDefinitions, skip_regex);
```
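Note that the four definitions shown above only cover `float`, parentheses, and `if`; for this input to lex without an error, the elided definitions (`/* .. */`) also have to cover identifiers, numbers, string literals, and the remaining punctuation.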
Parser
Recursive-Descent Parser
A recursive-descent parser without backtracking can be used if, for every nonterminal, each of its grammar rules starts with a distinct terminal symbol; a single token of lookahead then determines which rule to apply (here: `if`/`begin`/`print` for S, `end`/`;` for L).
```
S -> if E then S else S
S -> begin S L
S -> print E
L -> end
L -> ; S L
E -> num = num
```
Parser code:
```java
final int IF=1, THEN=2, ELSE=3, BEGIN=4, END=5, PRINT=6, SEMI=7, NUM=8, EQ=9;
int tok = getToken();
void advance() { tok = getToken(); }
void eat(int t) { if (tok == t) advance(); else error(); }

void S() {
  switch (tok) {
    case IF: eat(IF); E(); eat(THEN); S(); eat(ELSE); S(); break;
    case BEGIN: eat(BEGIN); S(); L(); break;
    case PRINT: eat(PRINT); E(); break;
    default: error();
  }
}

void L() {
  switch (tok) {
    case END: eat(END); break;
    case SEMI: eat(SEMI); S(); L(); break;
    default: error();
  }
}

void E() { eat(NUM); eat(EQ); eat(NUM); }
```
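The same parser can be driven directly by the token list produced by the lexer above. A minimal TypeScript sketch (an illustrative translation, not part of the original; the token names are assumptions):

```ts
import * as lexer from './lexer';

type Tok = "IF"|"THEN"|"ELSE"|"BEGIN"|"END"|"PRINT"|"SEMI"|"NUM"|"EQ";

const parse = (tokens: lexer.Token<Tok>[]): void => {
  let pos = 0;
  const tok = () => tokens[pos]?.type;
  // Consume the current token if it matches, otherwise report an error.
  const eat = (t: Tok) => {
    if (tok() === t) pos++;
    else throw Error(`Expected ${t}, got ${tok()}`);
  };
  const S = (): void => {
    switch (tok()) {
      case "IF": eat("IF"); E(); eat("THEN"); S(); eat("ELSE"); S(); break;
      case "BEGIN": eat("BEGIN"); S(); L(); break;
      case "PRINT": eat("PRINT"); E(); break;
      default: throw Error(`Unexpected token ${tok()}`);
    }
  };
  const L = (): void => {
    switch (tok()) {
      case "END": eat("END"); break;
      case "SEMI": eat("SEMI"); S(); L(); break;
      default: throw Error(`Unexpected token ${tok()}`);
    }
  };
  const E = () => { eat("NUM"); eat("EQ"); eat("NUM"); };
  S();
};
```

Calling `parse` on the tokens of `begin print 1 = 2 end` succeeds, while a missing `end` raises an error.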