Lexing

On this page

Table of Contents

The lexer (also called a tokenizer or scanner) takes Monkey source code as a string and breaks it into a sequence of tokens. Each token is the smallest meaningful unit of the language: a keyword, an identifier, a number, an operator.

This chapter builds the lexer across two files: token.zig for token types and keyword lookup, and lexer.zig for the lexer itself.

Token definition (`token.zig`)
#

We use an enum for token types. Type safe and fast since it compiles down to integer comparison instead of string comparison.

Complete `token.zig`
#

const std = @import("std");

pub const TokenType = enum {
    // Special.
    illegal,
    eof,

    // Identifiers and literals.
    ident,
    int,
    string,

    // Operators.
    assign,
    plus,
    minus,
    bang,
    asterisk,
    slash,
    lt,
    gt,
    eq,
    not_eq,

    // Delimiters.
    comma,
    semicolon,
    colon,
    lparen,
    rparen,
    lbrace,
    rbrace,
    lbracket,
    rbracket,

    // Keywords.
    function,
    let_,
    true_,
    false_,
    if_,
    else_,
    return_,
};

pub const Token = struct {
    type: TokenType,
    literal: []const u8,
};

/// Checks if an identifier is a keyword.
pub fn lookupIdent(ident: []const u8) TokenType {
    const keywords = std.StaticStringMap(TokenType).initComptime(.{
        .{ "fn", .function },
        .{ "let", .let_ },
        .{ "true", .true_ },
        .{ "false", .false_ },
        .{ "if", .if_ },
        .{ "else", .else_ },
        .{ "return", .return_ },
    });

    return keywords.get(ident) orelse .ident;
}

test "lookupIdent keywords" {
    try std.testing.expectEqual(TokenType.let_, lookupIdent("let"));
    try std.testing.expectEqual(TokenType.function, lookupIdent("fn"));
    try std.testing.expectEqual(TokenType.return_, lookupIdent("return"));
    try std.testing.expectEqual(TokenType.ident, lookupIdent("foobar"));
    try std.testing.expectEqual(TokenType.ident, lookupIdent("x"));
}

Trailing underscores: let, true, false, if, else, return are Zig keywords, so we append _ to avoid conflicts.

Lexer (`lexer.zig`)
#

The lexer holds a reference to the input string and tracks its position as it scans through characters.

Complete `lexer.zig`
#

const std = @import("std");
const token = @import("token.zig");

pub const Lexer = struct {
    input: []const u8,
    position: usize,
    read_position: usize,
    ch: u8,

    /// Creates a new lexer for a given input string.
    pub fn init(input: []const u8) Lexer {
        var l = Lexer{
            .input = input,
            .position = 0,
            .read_position = 0,
            .ch = 0,
        };
        l.readChar();
        return l;
    }

    /// Advances to the next character in the input.
    fn readChar(self: *Lexer) void {
        if (self.read_position >= self.input.len) {
            self.ch = 0;
        } else {
            self.ch = self.input[self.read_position];
        }
        self.position = self.read_position;
        self.read_position += 1;
    }

    /// Looks at the next character without advancing the position.
    fn peekChar(self: *Lexer) u8 {
        if (self.read_position >= self.input.len) {
            return 0;
        }

        return self.input[self.read_position];
    }

    /// Reads the next token from the input.
    pub fn nextToken(self: *Lexer) token.Token {
        self.skipWhitespace();

        const tok: token.Token = switch (self.ch) {
            '=' => blk: {
                if (self.peekChar() == '=') {
                    const start = self.position;
                    self.readChar();
                    break :blk .{ .type = .eq, .literal = self.input[start .. self.position + 1] };
                }
                break :blk .{ .type = .assign, .literal = "=" };
            },
            '+' => .{ .type = .plus, .literal = "+" },
            '-' => .{ .type = .minus, .literal = "-" },
            '!' => blk: {
                if (self.peekChar() == '=') {
                    const start = self.position;
                    self.readChar();
                    break :blk .{ .type = .not_eq, .literal = self.input[start .. self.position + 1] };
                }
                break :blk .{ .type = .bang, .literal = "!" };
            },
            '*' => .{ .type = .asterisk, .literal = "*" },
            '/' => .{ .type = .slash, .literal = "/" },
            '<' => .{ .type = .lt, .literal = "<" },
            '>' => .{ .type = .gt, .literal = ">" },
            ',' => .{ .type = .comma, .literal = "," },
            ';' => .{ .type = .semicolon, .literal = ";" },
            ':' => .{ .type = .colon, .literal = ":" },
            '(' => .{ .type = .lparen, .literal = "(" },
            ')' => .{ .type = .rparen, .literal = ")" },
            '{' => .{ .type = .lbrace, .literal = "{" },
            '}' => .{ .type = .rbrace, .literal = "}" },
            '[' => .{ .type = .lbracket, .literal = "[" },
            ']' => .{ .type = .rbracket, .literal = "]" },
            '"' => .{ .type = .string, .literal = self.readString() },
            0 => .{ .type = .eof, .literal = "" },
            else => {
                if (isLetter(self.ch)) {
                    const literal = self.readIdentifier();
                    return .{ .type = token.lookupIdent(literal), .literal = literal };
                } else if (isDigit(self.ch)) {
                    return .{ .type = .int, .literal = self.readNumber() };
                } else {
                    return .{ .type = .illegal, .literal = self.input[self.position .. self.position + 1] };
                }
            },
        };

        self.readChar();
        return tok;
    }

    /// Reads an identifier (letters and underscores).
    fn readIdentifier(self: *Lexer) []const u8 {
        const start = self.position;
        while (isLetter(self.ch)) {
            self.readChar();
        }
        return self.input[start..self.position];
    }

    /// Reads a number (Monkey doesn't use floats).
    fn readNumber(self: *Lexer) []const u8 {
        const start = self.position;
        while (isDigit(self.ch)) {
            self.readChar();
        }

        return self.input[start..self.position];
    }

    /// Reads a string literal between double quotes.
    /// Returns the content without the surrounding quotes.
    fn readString(self: *Lexer) []const u8 {
        const start = self.position + 1; // skip opening quote
        while (true) {
            self.readChar();
            if (self.ch == '"' or self.ch == 0) break;
        }

        return self.input[start..self.position];
    }

    /// Skips whitespace characters (spaces, tabs, newlines, carriage returns).
    fn skipWhitespace(self: *Lexer) void {
        while (self.ch == ' ' or self.ch == '\t' or self.ch == '\n' or self.ch == '\r') {
            self.readChar();
        }
    }
};

fn isLetter(ch: u8) bool {
    return (ch >= 'a' and ch <= 'z') or (ch >= 'A' and ch <= 'Z') or ch == '_';
}

fn isDigit(ch: u8) bool {
    return ch >= '0' and ch <= '9';
}

Tests
#

Basic token test
#

test "next token, basic symbols" {
    const input = "=+(){},;";
    var l = Lexer.init(input);

    const expected = [_]struct { expected_type: token.TokenType, expected_literal: []const u8 }{
        .{ .expected_type = .assign, .expected_literal = "=" },
        .{ .expected_type = .plus, .expected_literal = "+" },
        .{ .expected_type = .lparen, .expected_literal = "(" },
        .{ .expected_type = .rparen, .expected_literal = ")" },
        .{ .expected_type = .lbrace, .expected_literal = "{" },
        .{ .expected_type = .rbrace, .expected_literal = "}" },
        .{ .expected_type = .comma, .expected_literal = "," },
        .{ .expected_type = .semicolon, .expected_literal = ";" },
        .{ .expected_type = .eof, .expected_literal = "" },
    };

    for (expected) |tt| {
        const tok = l.nextToken();
        try std.testing.expectEqual(tt.expected_type, tok.type);
        try std.testing.expectEqualStrings(tt.expected_literal, tok.literal);
    }
}

Comprehensive test
#

This is the main lexer test from Thorsten Ball’s book. It tokenizes a complete Monkey program and verifies every token.

test "next token, complete program" {
    const input =
        \\let five = 5;
        \\let ten = 10;
        \\
        \\let add = fn(x, y) {
        \\  x + y;
        \\};
        \\
        \\let result = add(five, ten);
        \\!-/*5;
        \\5 < 10 > 5;
        \\
        \\if (5 < 10) {
        \\  return true;
        \\} else {
        \\  return false;
        \\}
        \\
        \\10 == 10;
        \\10 != 9;
        \\"foobar"
        \\"foo bar"
        \\[1, 2];
        \\{"foo": "bar"}
    ;
    var l = Lexer.init(input);

    // Comments below explain what code being scanned.
    const expected = [_]struct { expected_type: token.TokenType, expected_literal: []const u8 }{
        // let five = 5;
        .{ .expected_type = .let_, .expected_literal = "let" },
        .{ .expected_type = .ident, .expected_literal = "five" },
        .{ .expected_type = .assign, .expected_literal = "=" },
        .{ .expected_type = .int, .expected_literal = "5" },
        .{ .expected_type = .semicolon, .expected_literal = ";" },
        // let ten = 10;
        .{ .expected_type = .let_, .expected_literal = "let" },
        .{ .expected_type = .ident, .expected_literal = "ten" },
        .{ .expected_type = .assign, .expected_literal = "=" },
        .{ .expected_type = .int, .expected_literal = "10" },
        .{ .expected_type = .semicolon, .expected_literal = ";" },
        // let add = fn(x, y) { x + y; };
        .{ .expected_type = .let_, .expected_literal = "let" },
        .{ .expected_type = .ident, .expected_literal = "add" },
        .{ .expected_type = .assign, .expected_literal = "=" },
        .{ .expected_type = .function, .expected_literal = "fn" },
        .{ .expected_type = .lparen, .expected_literal = "(" },
        .{ .expected_type = .ident, .expected_literal = "x" },
        .{ .expected_type = .comma, .expected_literal = "," },
        .{ .expected_type = .ident, .expected_literal = "y" },
        .{ .expected_type = .rparen, .expected_literal = ")" },
        .{ .expected_type = .lbrace, .expected_literal = "{" },
        .{ .expected_type = .ident, .expected_literal = "x" },
        .{ .expected_type = .plus, .expected_literal = "+" },
        .{ .expected_type = .ident, .expected_literal = "y" },
        .{ .expected_type = .semicolon, .expected_literal = ";" },
        .{ .expected_type = .rbrace, .expected_literal = "}" },
        .{ .expected_type = .semicolon, .expected_literal = ";" },
        // let result = add(five, ten);
        .{ .expected_type = .let_, .expected_literal = "let" },
        .{ .expected_type = .ident, .expected_literal = "result" },
        .{ .expected_type = .assign, .expected_literal = "=" },
        .{ .expected_type = .ident, .expected_literal = "add" },
        .{ .expected_type = .lparen, .expected_literal = "(" },
        .{ .expected_type = .ident, .expected_literal = "five" },
        .{ .expected_type = .comma, .expected_literal = "," },
        .{ .expected_type = .ident, .expected_literal = "ten" },
        .{ .expected_type = .rparen, .expected_literal = ")" },
        .{ .expected_type = .semicolon, .expected_literal = ";" },
        // !-/*5;
        .{ .expected_type = .bang, .expected_literal = "!" },
        .{ .expected_type = .minus, .expected_literal = "-" },
        .{ .expected_type = .slash, .expected_literal = "/" },
        .{ .expected_type = .asterisk, .expected_literal = "*" },
        .{ .expected_type = .int, .expected_literal = "5" },
        .{ .expected_type = .semicolon, .expected_literal = ";" },
        // 5 < 10 > 5;
        .{ .expected_type = .int, .expected_literal = "5" },
        .{ .expected_type = .lt, .expected_literal = "<" },
        .{ .expected_type = .int, .expected_literal = "10" },
        .{ .expected_type = .gt, .expected_literal = ">" },
        .{ .expected_type = .int, .expected_literal = "5" },
        .{ .expected_type = .semicolon, .expected_literal = ";" },
        // if (5 < 10) { return true; } else { return false; }
        .{ .expected_type = .if_, .expected_literal = "if" },
        .{ .expected_type = .lparen, .expected_literal = "(" },
        .{ .expected_type = .int, .expected_literal = "5" },
        .{ .expected_type = .lt, .expected_literal = "<" },
        .{ .expected_type = .int, .expected_literal = "10" },
        .{ .expected_type = .rparen, .expected_literal = ")" },
        .{ .expected_type = .lbrace, .expected_literal = "{" },
        .{ .expected_type = .return_, .expected_literal = "return" },
        .{ .expected_type = .true_, .expected_literal = "true" },
        .{ .expected_type = .semicolon, .expected_literal = ";" },
        .{ .expected_type = .rbrace, .expected_literal = "}" },
        .{ .expected_type = .else_, .expected_literal = "else" },
        .{ .expected_type = .lbrace, .expected_literal = "{" },
        .{ .expected_type = .return_, .expected_literal = "return" },
        .{ .expected_type = .false_, .expected_literal = "false" },
        .{ .expected_type = .semicolon, .expected_literal = ";" },
        .{ .expected_type = .rbrace, .expected_literal = "}" },
        // 10 == 10;
        .{ .expected_type = .int, .expected_literal = "10" },
        .{ .expected_type = .eq, .expected_literal = "==" },
        .{ .expected_type = .int, .expected_literal = "10" },
        .{ .expected_type = .semicolon, .expected_literal = ";" },
        // 10 != 9;
        .{ .expected_type = .int, .expected_literal = "10" },
        .{ .expected_type = .not_eq, .expected_literal = "!=" },
        .{ .expected_type = .int, .expected_literal = "9" },
        .{ .expected_type = .semicolon, .expected_literal = ";" },
        // "foobar"
        .{ .expected_type = .string, .expected_literal = "foobar" },
        // "foo bar"
        .{ .expected_type = .string, .expected_literal = "foo bar" },
        // [1, 2];
        .{ .expected_type = .lbracket, .expected_literal = "[" },
        .{ .expected_type = .int, .expected_literal = "1" },
        .{ .expected_type = .comma, .expected_literal = "," },
        .{ .expected_type = .int, .expected_literal = "2" },
        .{ .expected_type = .rbracket, .expected_literal = "]" },
        .{ .expected_type = .semicolon, .expected_literal = ";" },
        // {"foo": "bar"}
        .{ .expected_type = .lbrace, .expected_literal = "{" },
        .{ .expected_type = .string, .expected_literal = "foo" },
        .{ .expected_type = .colon, .expected_literal = ":" },
        .{ .expected_type = .string, .expected_literal = "bar" },
        .{ .expected_type = .rbrace, .expected_literal = "}" },
        // EOF
        .{ .expected_type = .eof, .expected_literal = "" },
    };

    for (expected) |tt| {
        const tok = l.nextToken();
        try std.testing.expectEqual(tt.expected_type, tok.type);
        try std.testing.expectEqualStrings(tt.expected_literal, tok.literal);
    }
}

Verify it works
#

Run zig build test. No output means all tests passed.

In chapter 2 we build the parser, which takes these tokens and turns them into an Abstract Syntax Tree.

Token definition (token.zig) #

Complete token.zig #

Lexer (lexer.zig) #

Complete lexer.zig #

Tests #

Basic token test #

Comprehensive test #

Verify it works #