The lexer (also called a tokenizer or scanner) takes Monkey source code as a string and breaks it into a sequence of tokens. Each token is the smallest meaningful unit of the language: a keyword, an identifier, a number, an operator.
This chapter builds the lexer across two files: token.zig for token types and keyword lookup, and lexer.zig for the lexer itself.
Token definition (token.zig)
#
We use an enum for token types. This is type-safe and fast, since an enum comparison compiles down to an integer comparison instead of a string comparison.
Complete token.zig
#
const std = @import("std");
/// Every kind of token the Monkey lexer can produce.
/// Variants that collide with Zig reserved words (`let`, `true`, `false`,
/// `if`, `else`, `return`, and `fn` via `function`) carry a trailing
/// underscore or an alternative name to avoid the conflict.
pub const TokenType = enum {
    // Special.
    illegal,
    eof,
    // Identifiers and literals.
    ident,
    int,
    string,
    // Operators.
    assign,
    plus,
    minus,
    bang,
    asterisk,
    slash,
    lt,
    gt,
    eq,
    not_eq,
    // Delimiters.
    comma,
    semicolon,
    colon,
    lparen,
    rparen,
    lbrace,
    rbrace,
    lbracket,
    rbracket,
    // Keywords.
    function,
    let_,
    true_,
    false_,
    if_,
    else_,
    return_,
};
/// A single token: its category plus the literal text it was lexed from.
pub const Token = struct {
    // Token category (keyword, operator, literal, delimiter, ...).
    type: TokenType,
    // The matching source text. For identifiers, numbers, and strings this is
    // a slice borrowed from the lexer's input; fixed operators use static
    // string literals. The token never owns this memory.
    literal: []const u8,
};
/// Returns the keyword token type for `ident`, or `.ident` when the name
/// is not a reserved word of the Monkey language.
pub fn lookupIdent(ident: []const u8) TokenType {
    // The table is built entirely at compile time; lookups allocate nothing.
    const keyword_table = std.StaticStringMap(TokenType).initComptime(.{
        .{ "fn", .function },
        .{ "let", .let_ },
        .{ "true", .true_ },
        .{ "false", .false_ },
        .{ "if", .if_ },
        .{ "else", .else_ },
        .{ "return", .return_ },
    });
    if (keyword_table.get(ident)) |keyword| {
        return keyword;
    }
    return .ident;
}
test "lookupIdent keywords" {
    // Table-driven: keywords map to their dedicated variants, everything
    // else falls back to `.ident`.
    const cases = [_]struct { name: []const u8, want: TokenType }{
        .{ .name = "let", .want = .let_ },
        .{ .name = "fn", .want = .function },
        .{ .name = "return", .want = .return_ },
        .{ .name = "foobar", .want = .ident },
        .{ .name = "x", .want = .ident },
    };
    for (cases) |case| {
        try std.testing.expectEqual(case.want, lookupIdent(case.name));
    }
}
Trailing underscores:
`let`, `true`, `false`, `if`, `else`, and `return` are Zig keywords, so we append `_` to avoid conflicts.
Lexer (lexer.zig)
#
The lexer holds a reference to the input string and tracks its position as it scans through characters.
Complete lexer.zig
#
const std = @import("std");
const token = @import("token.zig");
pub const Lexer = struct {
    /// Source text being scanned; returned tokens borrow slices of it.
    input: []const u8,
    /// Index of the character currently held in `ch`.
    position: usize,
    /// Index of the next character to be read.
    read_position: usize,
    /// Character under examination; 0 acts as the EOF sentinel.
    ch: u8,

    /// Creates a new lexer for a given input string.
    /// `input` must outlive the lexer and every token it produces.
    pub fn init(input: []const u8) Lexer {
        var l = Lexer{
            .input = input,
            .position = 0,
            .read_position = 0,
            .ch = 0,
        };
        // Load the first character so `ch`/`position` are valid immediately.
        l.readChar();
        return l;
    }

    /// Advances to the next character in the input.
    /// Past the end of input, `ch` stays 0.
    fn readChar(self: *Lexer) void {
        if (self.read_position >= self.input.len) {
            self.ch = 0;
        } else {
            self.ch = self.input[self.read_position];
        }
        self.position = self.read_position;
        self.read_position += 1;
    }

    /// Looks at the next character without advancing the position.
    fn peekChar(self: *Lexer) u8 {
        if (self.read_position >= self.input.len) {
            return 0;
        }
        return self.input[self.read_position];
    }

    /// Reads and returns the next token, advancing past it.
    /// Once the input is exhausted, every call returns an `.eof` token.
    pub fn nextToken(self: *Lexer) token.Token {
        self.skipWhitespace();
        const tok: token.Token = switch (self.ch) {
            '=' => blk: {
                // "==" is the only two-character token starting with '='.
                if (self.peekChar() == '=') {
                    const start = self.position;
                    self.readChar();
                    break :blk .{ .type = .eq, .literal = self.input[start .. self.position + 1] };
                }
                break :blk .{ .type = .assign, .literal = "=" };
            },
            '+' => .{ .type = .plus, .literal = "+" },
            '-' => .{ .type = .minus, .literal = "-" },
            '!' => blk: {
                // "!=" is the only two-character token starting with '!'.
                if (self.peekChar() == '=') {
                    const start = self.position;
                    self.readChar();
                    break :blk .{ .type = .not_eq, .literal = self.input[start .. self.position + 1] };
                }
                break :blk .{ .type = .bang, .literal = "!" };
            },
            '*' => .{ .type = .asterisk, .literal = "*" },
            '/' => .{ .type = .slash, .literal = "/" },
            '<' => .{ .type = .lt, .literal = "<" },
            '>' => .{ .type = .gt, .literal = ">" },
            ',' => .{ .type = .comma, .literal = "," },
            ';' => .{ .type = .semicolon, .literal = ";" },
            ':' => .{ .type = .colon, .literal = ":" },
            '(' => .{ .type = .lparen, .literal = "(" },
            ')' => .{ .type = .rparen, .literal = ")" },
            '{' => .{ .type = .lbrace, .literal = "{" },
            '}' => .{ .type = .rbrace, .literal = "}" },
            '[' => .{ .type = .lbracket, .literal = "[" },
            ']' => .{ .type = .rbracket, .literal = "]" },
            '"' => .{ .type = .string, .literal = self.readString() },
            0 => .{ .type = .eof, .literal = "" },
            else => {
                if (isLetter(self.ch)) {
                    // readIdentifier() already advanced past the identifier,
                    // so return directly and skip the trailing readChar().
                    const literal = self.readIdentifier();
                    return .{ .type = token.lookupIdent(literal), .literal = literal };
                } else if (isDigit(self.ch)) {
                    return .{ .type = .int, .literal = self.readNumber() };
                } else {
                    // Bug fix: capture the offending byte, then advance past
                    // it before returning. The previous version returned
                    // without calling readChar(), so a single illegal
                    // character made nextToken() yield the same .illegal
                    // token forever.
                    const literal = self.input[self.position .. self.position + 1];
                    self.readChar();
                    return .{ .type = .illegal, .literal = literal };
                }
            },
        };
        self.readChar();
        return tok;
    }

    /// Reads an identifier (letters and underscores).
    /// Leaves `ch` on the first character after the identifier.
    fn readIdentifier(self: *Lexer) []const u8 {
        const start = self.position;
        while (isLetter(self.ch)) {
            self.readChar();
        }
        return self.input[start..self.position];
    }

    /// Reads a number (Monkey only has integers, no floats).
    /// Leaves `ch` on the first character after the digits.
    fn readNumber(self: *Lexer) []const u8 {
        const start = self.position;
        while (isDigit(self.ch)) {
            self.readChar();
        }
        return self.input[start..self.position];
    }

    /// Reads a string literal between double quotes.
    /// Returns the content without the surrounding quotes. An unterminated
    /// string yields everything up to end of input; no error is reported.
    fn readString(self: *Lexer) []const u8 {
        const start = self.position + 1; // skip opening quote
        while (true) {
            self.readChar();
            if (self.ch == '"' or self.ch == 0) break;
        }
        return self.input[start..self.position];
    }

    /// Skips whitespace characters (spaces, tabs, newlines, carriage returns).
    fn skipWhitespace(self: *Lexer) void {
        while (self.ch == ' ' or self.ch == '\t' or self.ch == '\n' or self.ch == '\r') {
            self.readChar();
        }
    }
};
/// True for the characters allowed in Monkey identifiers:
/// ASCII letters and the underscore.
fn isLetter(ch: u8) bool {
    return std.ascii.isAlphabetic(ch) or ch == '_';
}
/// True for ASCII decimal digits '0' through '9'.
fn isDigit(ch: u8) bool {
    return std.ascii.isDigit(ch);
}
Tests #
Basic token test #
test "next token, basic symbols" {
    // One expected (type, literal) pair per character of the input.
    const Case = struct { ttype: token.TokenType, literal: []const u8 };
    const cases = [_]Case{
        .{ .ttype = .assign, .literal = "=" },
        .{ .ttype = .plus, .literal = "+" },
        .{ .ttype = .lparen, .literal = "(" },
        .{ .ttype = .rparen, .literal = ")" },
        .{ .ttype = .lbrace, .literal = "{" },
        .{ .ttype = .rbrace, .literal = "}" },
        .{ .ttype = .comma, .literal = "," },
        .{ .ttype = .semicolon, .literal = ";" },
        .{ .ttype = .eof, .literal = "" },
    };
    var lex = Lexer.init("=+(){},;");
    for (cases) |case| {
        const tok = lex.nextToken();
        try std.testing.expectEqual(case.ttype, tok.type);
        try std.testing.expectEqualStrings(case.literal, tok.literal);
    }
}
Comprehensive test #
This is the main lexer test from Thorsten Ball’s book. It tokenizes a complete Monkey program and verifies every token.
test "next token, complete program" {
    // The canonical lexer test from "Writing An Interpreter In Go":
    // tokenize a full Monkey program and check every token in order.
    const input =
        \\let five = 5;
        \\let ten = 10;
        \\
        \\let add = fn(x, y) {
        \\    x + y;
        \\};
        \\
        \\let result = add(five, ten);
        \\!-/*5;
        \\5 < 10 > 5;
        \\
        \\if (5 < 10) {
        \\    return true;
        \\} else {
        \\    return false;
        \\}
        \\
        \\10 == 10;
        \\10 != 9;
        \\"foobar"
        \\"foo bar"
        \\[1, 2];
        \\{"foo": "bar"}
    ;
    var l = Lexer.init(input);
    // Comments below show which source line is being scanned.
    const expected = [_]struct { expected_type: token.TokenType, expected_literal: []const u8 }{
        // let five = 5;
        .{ .expected_type = .let_, .expected_literal = "let" },
        .{ .expected_type = .ident, .expected_literal = "five" },
        .{ .expected_type = .assign, .expected_literal = "=" },
        .{ .expected_type = .int, .expected_literal = "5" },
        .{ .expected_type = .semicolon, .expected_literal = ";" },
        // let ten = 10;
        .{ .expected_type = .let_, .expected_literal = "let" },
        .{ .expected_type = .ident, .expected_literal = "ten" },
        .{ .expected_type = .assign, .expected_literal = "=" },
        .{ .expected_type = .int, .expected_literal = "10" },
        .{ .expected_type = .semicolon, .expected_literal = ";" },
        // let add = fn(x, y) { x + y; };
        .{ .expected_type = .let_, .expected_literal = "let" },
        .{ .expected_type = .ident, .expected_literal = "add" },
        .{ .expected_type = .assign, .expected_literal = "=" },
        .{ .expected_type = .function, .expected_literal = "fn" },
        .{ .expected_type = .lparen, .expected_literal = "(" },
        .{ .expected_type = .ident, .expected_literal = "x" },
        .{ .expected_type = .comma, .expected_literal = "," },
        .{ .expected_type = .ident, .expected_literal = "y" },
        .{ .expected_type = .rparen, .expected_literal = ")" },
        .{ .expected_type = .lbrace, .expected_literal = "{" },
        .{ .expected_type = .ident, .expected_literal = "x" },
        .{ .expected_type = .plus, .expected_literal = "+" },
        .{ .expected_type = .ident, .expected_literal = "y" },
        .{ .expected_type = .semicolon, .expected_literal = ";" },
        .{ .expected_type = .rbrace, .expected_literal = "}" },
        .{ .expected_type = .semicolon, .expected_literal = ";" },
        // let result = add(five, ten);
        .{ .expected_type = .let_, .expected_literal = "let" },
        .{ .expected_type = .ident, .expected_literal = "result" },
        .{ .expected_type = .assign, .expected_literal = "=" },
        .{ .expected_type = .ident, .expected_literal = "add" },
        .{ .expected_type = .lparen, .expected_literal = "(" },
        .{ .expected_type = .ident, .expected_literal = "five" },
        .{ .expected_type = .comma, .expected_literal = "," },
        .{ .expected_type = .ident, .expected_literal = "ten" },
        .{ .expected_type = .rparen, .expected_literal = ")" },
        .{ .expected_type = .semicolon, .expected_literal = ";" },
        // !-/*5;
        .{ .expected_type = .bang, .expected_literal = "!" },
        .{ .expected_type = .minus, .expected_literal = "-" },
        .{ .expected_type = .slash, .expected_literal = "/" },
        .{ .expected_type = .asterisk, .expected_literal = "*" },
        .{ .expected_type = .int, .expected_literal = "5" },
        .{ .expected_type = .semicolon, .expected_literal = ";" },
        // 5 < 10 > 5;
        .{ .expected_type = .int, .expected_literal = "5" },
        .{ .expected_type = .lt, .expected_literal = "<" },
        .{ .expected_type = .int, .expected_literal = "10" },
        .{ .expected_type = .gt, .expected_literal = ">" },
        .{ .expected_type = .int, .expected_literal = "5" },
        .{ .expected_type = .semicolon, .expected_literal = ";" },
        // if (5 < 10) { return true; } else { return false; }
        .{ .expected_type = .if_, .expected_literal = "if" },
        .{ .expected_type = .lparen, .expected_literal = "(" },
        .{ .expected_type = .int, .expected_literal = "5" },
        .{ .expected_type = .lt, .expected_literal = "<" },
        .{ .expected_type = .int, .expected_literal = "10" },
        .{ .expected_type = .rparen, .expected_literal = ")" },
        .{ .expected_type = .lbrace, .expected_literal = "{" },
        .{ .expected_type = .return_, .expected_literal = "return" },
        .{ .expected_type = .true_, .expected_literal = "true" },
        .{ .expected_type = .semicolon, .expected_literal = ";" },
        .{ .expected_type = .rbrace, .expected_literal = "}" },
        .{ .expected_type = .else_, .expected_literal = "else" },
        .{ .expected_type = .lbrace, .expected_literal = "{" },
        .{ .expected_type = .return_, .expected_literal = "return" },
        .{ .expected_type = .false_, .expected_literal = "false" },
        .{ .expected_type = .semicolon, .expected_literal = ";" },
        .{ .expected_type = .rbrace, .expected_literal = "}" },
        // 10 == 10;
        .{ .expected_type = .int, .expected_literal = "10" },
        .{ .expected_type = .eq, .expected_literal = "==" },
        .{ .expected_type = .int, .expected_literal = "10" },
        .{ .expected_type = .semicolon, .expected_literal = ";" },
        // 10 != 9;
        .{ .expected_type = .int, .expected_literal = "10" },
        .{ .expected_type = .not_eq, .expected_literal = "!=" },
        .{ .expected_type = .int, .expected_literal = "9" },
        .{ .expected_type = .semicolon, .expected_literal = ";" },
        // "foobar"
        .{ .expected_type = .string, .expected_literal = "foobar" },
        // "foo bar"
        .{ .expected_type = .string, .expected_literal = "foo bar" },
        // [1, 2];
        .{ .expected_type = .lbracket, .expected_literal = "[" },
        .{ .expected_type = .int, .expected_literal = "1" },
        .{ .expected_type = .comma, .expected_literal = "," },
        .{ .expected_type = .int, .expected_literal = "2" },
        .{ .expected_type = .rbracket, .expected_literal = "]" },
        .{ .expected_type = .semicolon, .expected_literal = ";" },
        // {"foo": "bar"}
        .{ .expected_type = .lbrace, .expected_literal = "{" },
        .{ .expected_type = .string, .expected_literal = "foo" },
        .{ .expected_type = .colon, .expected_literal = ":" },
        .{ .expected_type = .string, .expected_literal = "bar" },
        .{ .expected_type = .rbrace, .expected_literal = "}" },
        // EOF
        .{ .expected_type = .eof, .expected_literal = "" },
    };
    for (expected) |tt| {
        const tok = l.nextToken();
        try std.testing.expectEqual(tt.expected_type, tok.type);
        try std.testing.expectEqualStrings(tt.expected_literal, tok.literal);
    }
}
Verify it works #
Run zig build test. No output means all tests passed.
In chapter 2 we build the parser, which takes these tokens and turns them into an Abstract Syntax Tree.