Compiler
Lexer
The lexer takes a string as input, removes irrelevant information such as comments and whitespace, and outputs a list of tokens that can then be processed further by a parser.
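For example, with the token definitions from step 1 below, the input `temp, 13` would be turned into the token stream `ID COMMA NUM`, with the values `temp` and `13` attached to the ID and NUM tokens.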
1) Define Tokens
| Type | Example | Regex |
|---|---|---|
| ID | temp data | [a-zA-Z][a-zA-Z0-9]* |
| NUM | 13 43 | [0-9]+ |
| COMMA | , | , |
| LPAREN | ( | \( |
2) Define Skip Regexes
| Type | Example | Regex |
|---|---|---|
| Comment | # bla blubb | '#' [^\n]* '\n' |
| Whitespace | ' ' '\t' '\n' | (' '|'\t'|'\n')+ |
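Anything a skip regex matches is consumed without producing a token. A quick sketch (illustrative, not part of the original) of the comment rule as a sticky JavaScript regex:

```ts
// Sticky ("y") regexes only match at lastIndex, which is how a lexer
// anchors matching at its current position in the input.
const skip = /#[^\n]*\n|[ \t\n]+/y;
skip.lastIndex = 0;
console.log(skip.test("# bla blubb\ntemp"));  // true: the comment matched
console.log(skip.lastIndex);                  // 12: lexing resumes at "temp"
```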
3) Type Definitions
lexer.h
```cpp
#pragma once

#include <string>

namespace lexer {

enum class TokenType {
  kUnknown,  // sentinel returned when no definition matches
  kFloat,
  kLParen,
  kRParen,
  kIf,
  /* .. */
};

struct TokenDefinition {
  TokenType type;
  std::string regex;
};

struct Token {
  TokenType type;
  std::string value;
};

}  // namespace lexer
```
lexer.ts
```ts
export type TokenType =
  "FLOAT" |
  "LPAREN" |
  "RPAREN" |
  "IF" /* .. */;

export interface TokenDefinition<T> {
  type: T;
  regex: string;
}

export interface Token<T> {
  type: T;
  value: string;
}
```
4) Define Token Definitions
main.cpp
```cpp
#include <regex>
#include <vector>

#include "lexer.h"

using lexer::TokenType;

const std::vector<lexer::TokenDefinition> token_definitions{
    {TokenType::kFloat, "(float)"},
    {TokenType::kLParen, "(\\()"},
    {TokenType::kRParen, "(\\))"},
    {TokenType::kIf, "(if)"},
    /* .. */
};

// Matches a single whitespace character or a C-style block comment.
const std::regex skip_regex(
    " |\n|\r|\\/\\*(\\*(?!\\/)|[^*])*\\*\\/", std::regex::ECMAScript);
```
main.ts
```ts
import * as lexer from './lexer';

const tokenDefinitions: lexer.TokenDefinition<lexer.TokenType>[] = [
  {type: "FLOAT", regex: "float"},
  {type: "LPAREN", regex: "\\("},
  {type: "RPAREN", regex: "\\)"},
  {type: "IF", regex: "if"},
  /* .. */
];

// Matches a single whitespace character or a C-style block comment.
const skip_regex = " |\n|\r|\\/\\*(\\*(?!\\/)|[^*])*\\*\\/";
```
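The last alternative of `skip_regex` matches C-style block comments; the negative lookahead `(?!\/)` lets it step over `*` characters inside the comment without ending it early. A quick check (illustrative, not part of the original):

```ts
const re = new RegExp(skip_regex, "y");     // "y" anchors the match at lastIndex
console.log(re.test("/* comment */rest"));  // true: the whole comment matched
console.log(re.lastIndex);                  // 13: lexing would resume at "rest"
```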
5) Implement Lexer
lexer.cpp
```cpp
#include <list>
#include <regex>
#include <sstream>
#include <stdexcept>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

#include "lexer.h"

namespace lexer {

// Tries every token definition at the start of `text` and returns the
// matched token together with the remaining text after it.
std::tuple<Token, std::string> consume_token_(
    const std::string& text,
    const std::vector<TokenDefinition>& token_definitions)
{
  for (const auto& token_definition : token_definitions) {
    std::smatch match;
    std::regex regex(token_definition.regex, std::regex::ECMAScript);
    // match_continuous anchors the match at the beginning of `text`.
    if (std::regex_search(text, match, regex,
                          std::regex_constants::match_continuous)) {
      Token token{token_definition.type, match[1]};
      return std::make_tuple(token, match.suffix().str());
    }
  }
  return std::make_tuple(Token{TokenType::kUnknown, ""}, "");
}

std::list<Token> lex(
    std::string text,
    const std::vector<TokenDefinition>& token_definitions,
    const std::regex& skip_regex)
{
  std::list<Token> tokens;
  while (!text.empty()) {
    std::smatch match;
    // First try to skip comments and whitespace at the current position.
    if (std::regex_search(text, match, skip_regex,
                          std::regex_constants::match_continuous)) {
      text = match.suffix().str();
      continue;
    }
    auto [token, text_after_token_consumed] =
        consume_token_(text, token_definitions);
    if (token.type == TokenType::kUnknown) {
      std::stringstream ss;
      ss << "Got input that did not match any definition: " << text;
      throw std::runtime_error(ss.str());
    }
    tokens.emplace_back(token);
    text = std::move(text_after_token_consumed);
  }
  return tokens;
}

}  // namespace lexer
```
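Note the `std::regex_constants::match_continuous` flag: without it, `std::regex_search` finds a match anywhere in the string, so a skip or token match in the middle of the input would silently drop everything before it. The TypeScript version below gets the same anchoring from the sticky flag (`y`) together with `lastIndex`.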
lexer.ts
```ts
export const lex = <T>(
  text: string,
  tokenDefinitions: TokenDefinition<T>[],
  nontokenDefinition: string): Token<T>[] => {
  const tokens: Token<T>[] = [];
  const skipRegex = new RegExp(nontokenDefinition, "y");
  let currentSearchIndex = 0;
  while (currentSearchIndex < text.length) {
    // First try to skip comments and whitespace at the current position.
    skipRegex.lastIndex = currentSearchIndex;
    if (skipRegex.test(text)) {
      currentSearchIndex = skipRegex.lastIndex;
    } else {
      const { token, lastIndex } = findToken<T>(text, tokenDefinitions, currentSearchIndex);
      if (token && lastIndex !== undefined) {
        tokens.push(token);
        currentSearchIndex = lastIndex;
      } else {
        const prettyText = removeLineBreaksAndTabs(text.slice(currentSearchIndex));
        throw Error(`Got input that did not match any definition: ${prettyText}`);
      }
    }
  }
  return tokens;
};

// Tries every token definition at `lastIndex`; the sticky flag ("y")
// anchors the match at exactly that position.
const findToken = <T>(
  text: string,
  tokenDefinitions: TokenDefinition<T>[],
  lastIndex: number): TokenSearchResult<T> => {
  for (const token of tokenDefinitions) {
    const regExp = new RegExp(token.regex, "y");
    regExp.lastIndex = lastIndex;
    const match = regExp.exec(text);
    if (match) {
      return {
        token: { type: token.type, value: match[0] },
        lastIndex: regExp.lastIndex
      };
    }
  }
  return {};
};

const removeLineBreaksAndTabs = (text: string) =>
  text.replace(/\n/g, "\\n").replace(/\t/g, "\\t");

interface TokenSearchResult<T> {
  token?: Token<T>;
  lastIndex?: number;
}
```
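A minimal usage sketch (not part of the original) with two token definitions and a whitespace-only skip regex:

```ts
import { lex } from './lexer';

const tokens = lex(
  "if ((",
  [{ type: "IF", regex: "if" }, { type: "LPAREN", regex: "\\(" }],
  " +");
console.log(tokens);
// [ { type: 'IF', value: 'if' },
//   { type: 'LPAREN', value: '(' },
//   { type: 'LPAREN', value: '(' } ]
```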
6) Run Lexer
main.cpp
```cpp
const std::string text =
    "float match0(char *s)\n"
    "{if (!strcmp(s, \"0.0\", 3))\n"
    "  return 0.0;\n"
    "};";
auto tokens = lexer::lex(text, token_definitions, skip_regex);
```
main.ts
```ts
const text = `
float match0(char *s)
{if (!strcmp(s, "0.0", 3))
  return 0.0;
}`;
const tokens = lexer.lex(text, tokenDefinitions, skip_regex);
```
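Note that the four definitions shown above only cover `float`, parentheses, and `if`; for this input to lex without an error, the elided definitions (`/* .. */`) also have to cover identifiers, numbers, string literals, and the remaining punctuation.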
Parser
Recursive-Descent Parser
A recursive-descent parser without backtracking can be used if, for every nonterminal, each of its grammar rules starts with a distinct terminal symbol; a single token of lookahead then determines which rule to apply (here: `if`/`begin`/`print` for S, `end`/`;` for L).
```
S -> if E then S else S
S -> begin S L
S -> print E
L -> end
L -> ; S L
E -> num = num
```
Parser code:
```java
final int IF=1, THEN=2, ELSE=3, BEGIN=4, END=5, PRINT=6, SEMI=7, NUM=8, EQ=9;
int tok = getToken();
void advance() { tok = getToken(); }
void eat(int t) { if (tok == t) advance(); else error(); }

void S() {
  switch (tok) {
    case IF: eat(IF); E(); eat(THEN); S(); eat(ELSE); S(); break;
    case BEGIN: eat(BEGIN); S(); L(); break;
    case PRINT: eat(PRINT); E(); break;
    default: error();
  }
}

void L() {
  switch (tok) {
    case END: eat(END); break;
    case SEMI: eat(SEMI); S(); L(); break;
    default: error();
  }
}

void E() { eat(NUM); eat(EQ); eat(NUM); }
```
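The same parser can be driven directly by the token list produced by the lexer above. A minimal TypeScript sketch (an illustrative translation, not part of the original; the token names are assumptions):

```ts
import * as lexer from './lexer';

type Tok = "IF"|"THEN"|"ELSE"|"BEGIN"|"END"|"PRINT"|"SEMI"|"NUM"|"EQ";

const parse = (tokens: lexer.Token<Tok>[]): void => {
  let pos = 0;
  const tok = () => tokens[pos]?.type;
  // Consume the current token if it matches, otherwise report an error.
  const eat = (t: Tok) => {
    if (tok() === t) pos++;
    else throw Error(`Expected ${t}, got ${tok()}`);
  };
  const S = (): void => {
    switch (tok()) {
      case "IF": eat("IF"); E(); eat("THEN"); S(); eat("ELSE"); S(); break;
      case "BEGIN": eat("BEGIN"); S(); L(); break;
      case "PRINT": eat("PRINT"); E(); break;
      default: throw Error(`Unexpected token ${tok()}`);
    }
  };
  const L = (): void => {
    switch (tok()) {
      case "END": eat("END"); break;
      case "SEMI": eat("SEMI"); S(); L(); break;
      default: throw Error(`Unexpected token ${tok()}`);
    }
  };
  const E = () => { eat("NUM"); eat("EQ"); eat("NUM"); };
  S();
};
```

Calling `parse` on the tokens of `begin print 1 = 2 end` succeeds, while a missing `end` raises an error.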