Integrating Logos with LALRPOP for Custom Lexing

Building a lexer from scratch can be complex and error-prone. Instead, leveraging existing libraries like Logos simplifies tokenization significantly. This guide demonstrates how to integrate Logos as the lexer for a toy language parsed with LALRPOP.

Consider the following input:

var a = 42;
var b = 23;

# a comment
print (a - b);

Dependency Setup

Add Logos to your Cargo.toml:

logos = "0.14"

Abstract Syntax Tree

Define the AST to represent parsed constructs:

/// A single top-level statement of the toy language.
#[derive(Debug, Clone, PartialEq)]
pub enum Statement {
    /// A variable declaration binding `name` to the expression `value`.
    Variable {
        name: String,
        value: Box<Expression>,
    },
    /// A print statement whose operand is the expression `value`.
    Print {
        value: Box<Expression>,
    },
}

/// An expression node of the toy language.
#[derive(Debug, Clone, PartialEq)]
pub enum Expression {
    /// A 64-bit signed integer literal.
    Integer(i64),
    /// A reference to a variable by name.
    Variable(String),
    /// Two sub-expressions combined by an [`Operator`].
    BinaryOperation {
        /// Left-hand operand.
        lhs: Box<Expression>,
        /// The operator joining both operands.
        operator: Operator,
        /// Right-hand operand.
        rhs: Box<Expression>,
    },
}

/// Binary operators that can appear in an `Expression::BinaryOperation`.
///
/// The enum carries no data, so it is `Copy` and can be compared, hashed and
/// passed by value without cloning.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum Operator {
    /// Addition.
    Add,
    /// Subtraction.
    Sub,
    /// Multiplication.
    Mul,
    /// Division.
    Div,
}

Token Definition

In tokens.rs, define tokens and error handling:

use std::fmt;
use std::num::ParseIntError;
use logos::Logos;

/// Errors reported while turning source text into tokens.
///
/// The type implements [`fmt::Display`] and [`std::error::Error`] so that it
/// can be printed in diagnostics and converted into `Box<dyn Error>` when
/// parse results are propagated with `?`.
#[derive(Default, Debug, Clone, PartialEq)]
pub enum LexicalError {
    /// An integer literal could not be parsed; wraps the underlying parse error.
    InvalidInteger(ParseIntError),
    /// Fallback error; `#[default]` makes this the value of
    /// `LexicalError::default()`.
    #[default]
    InvalidToken,
}

impl From<ParseIntError> for LexicalError {
    /// Wraps an integer parsing failure so `?` / `Into` can produce a
    /// `LexicalError` directly.
    fn from(err: ParseIntError) -> Self {
        LexicalError::InvalidInteger(err)
    }
}

impl fmt::Display for LexicalError {
    /// Writes a short, human-readable description of the error.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Self::InvalidInteger(err) => write!(f, "invalid integer literal: {err}"),
            Self::InvalidToken => f.write_str("invalid token"),
        }
    }
}

impl std::error::Error for LexicalError {
    /// Exposes the wrapped `ParseIntError`, if any, as the error's cause.
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        match self {
            Self::InvalidInteger(err) => Some(err),
            Self::InvalidToken => None,
        }
    }
}

/// Tokens of the toy language, recognized by the Logos-generated lexer.
///
/// The `#[logos(...)]` attribute skips whitespace and `#` line comments and
/// selects `LexicalError` as the lexer's error type.
#[derive(Logos, Clone, Debug, PartialEq)]
#[logos(skip r"[ \t\n\f]+", skip r"#.*\n?", error = LexicalError)]
pub enum Token {
    /// The `var` keyword.
    #[token("var")]
    KeywordVar,
    /// The `print` keyword.
    #[token("print")]
    KeywordPrint,

    /// An identifier; carries the matched text.
    #[regex("[_a-zA-Z][_0-9a-zA-Z]*", |lex| lex.slice().to_string())]
    Identifier(String),
    /// A decimal integer literal parsed into `i64`.
    ///
    /// The pattern accepts a lone `0` as well as any literal starting with a
    /// non-zero digit; without the `0` alternative the literal `0` could not
    /// be lexed at all. Literals with leading zeros (e.g. `007`) are still not
    /// matched as a single integer.
    #[regex("0|[1-9][0-9]*", |lex| lex.slice().parse())]
    Integer(i64),

    /// `(`
    #[token("(")]
    LParen,
    /// `)`
    #[token(")")]
    RParen,
    /// `=`
    #[token("=")]
    Assign,
    /// `;`
    #[token(";")]
    Semicolon,

    /// `+`
    #[token("+")]
    OperatorAdd,
    /// `-`
    #[token("-")]
    OperatorSub,
    /// `*`
    #[token("*")]
    OperatorMul,
    /// `/`
    #[token("/")]
    OperatorDiv,
}

impl fmt::Display for Token {
    /// Renders the token using its pretty-printed `Debug` representation.
    fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
        formatter.write_fmt(format_args!("{:#?}", self))
    }
}

Logos resolves ambiguities by preferring longer matches over shorter ones and specific patterns over generic ones. For example, "printa" is recognized as an Identifier, not as KeywordPrint followed by "a".

Lexer Implementation

Create lexer.rs to bridge Logos with LALRPOP:

use logos::{Logos, SpannedIter};
use crate::tokens::{Token, LexicalError};

/// Result of lexing one token: on success a `(start, token, end)` triple,
/// otherwise the lexer error.
pub type Spanned<T, L, E> = Result<(L, T, L), E>;

/// Adapter that wraps the Logos token iterator so it can feed a parser.
pub struct Lexer<'input> {
    /// Logos iterator yielding each lexing result together with its span.
    token_stream: SpannedIter<'input, Token>,
}

impl<'input> Lexer<'input> {
    /// Creates a lexer over `input`, borrowing the source text for `'input`.
    pub fn new(input: &'input str) -> Self {
        let token_stream = Token::lexer(input).spanned();
        Lexer { token_stream }
    }
}

impl<'input> Iterator for Lexer<'input> {
    type Item = Spanned<Token, usize, LexicalError>;

    /// Yields the next token as `(start, token, end)`, or the lexer error;
    /// returns `None` once the underlying token stream is exhausted.
    fn next(&mut self) -> Option<Self::Item> {
        let (lexed, span) = self.token_stream.next()?;
        Some(lexed.map(|token| (span.start, token, span.end)))
    }
}

Grammar Integration

In your .lalrpop file, declare external token mappings:

use crate::tokens::{Token, LexicalError};
use crate::ast;

grammar;

// Declares the externally supplied lexer: positions are `usize`, lexer errors
// are `LexicalError`, and each quoted terminal used in the grammar rules below
// is mapped to a variant of the `Token` enum.
extern {
    type Location = usize;
    type Error = LexicalError;

    enum Token {
        // Keywords.
        "var" => Token::KeywordVar,
        "print" => Token::KeywordPrint,
        // Terminals carrying a value; `<String>` / `<i64>` mark the payload
        // handed to the grammar actions.
        "identifier" => Token::Identifier(<String>),
        "int" => Token::Integer(<i64>),
        // Punctuation.
        "(" => Token::LParen,
        ")" => Token::RParen,
        "=" => Token::Assign,
        ";" => Token::Semicolon,
        // Arithmetic operators.
        "+" => Token::OperatorAdd,
        "-" => Token::OperatorSub,
        "*" => Token::OperatorMul,
        "/" => Token::OperatorDiv,
    }
}

// A script is a (possibly empty) sequence of statements.
pub Script: Vec<ast::Statement> = Statement*;

pub Statement: ast::Statement = {
    // var <identifier> = <expression>;
    "var" <ident:"identifier"> "=" <init:Expression> ";" =>
        ast::Statement::Variable { name: ident, value: init },

    // print <expression>;
    "print" <arg:Expression> ";" =>
        ast::Statement::Print { value: arg },
};

// Arithmetic expressions built from `Term`s and the four binary operators.
// Every alternative produces a boxed `ast::Expression`.
pub Expression: Box<ast::Expression> = {
    // Level 1: plain terms (literals, variables, parenthesized expressions).
    #[precedence(level="1")]
    Term,

    // Level 2: multiplication, declared left-associative.
    #[precedence(level="2")] #[assoc(side="left")]
    <lhs:Expression> "*" <rhs:Expression> => {
        Box::new(ast::Expression::BinaryOperation {
            lhs,
            operator: ast::Operator::Mul,
            rhs,
        })
    },
    // Division carries no annotation of its own.
    // NOTE(review): presumably it shares the level/associativity of the
    // alternative above — confirm against the LALRPOP precedence docs.
    <lhs:Expression> "/" <rhs:Expression> => {
        Box::new(ast::Expression::BinaryOperation {
            lhs,
            operator: ast::Operator::Div,
            rhs,
        })
    },

    // Level 3: addition, declared left-associative.
    #[precedence(level="3")] #[assoc(side="left")]
    <lhs:Expression> "+" <rhs:Expression> => {
        Box::new(ast::Expression::BinaryOperation {
            lhs,
            operator: ast::Operator::Add,
            rhs,
        })
    },
    // Subtraction carries no annotation of its own.
    // NOTE(review): presumably grouped with "+" above — confirm.
    <lhs:Expression> "-" <rhs:Expression> => {
        Box::new(ast::Expression::BinaryOperation {
            lhs,
            operator: ast::Operator::Sub,
            rhs,
        })
    },
};

// Atomic expressions: integer literals, variable references, and
// parenthesized sub-expressions.
pub Term: Box<ast::Expression> = {
    <n:"int"> => Box::new(ast::Expression::Integer(n)),
    <ident:"identifier"> => Box::new(ast::Expression::Variable(ident)),
    "(" <inner:Expression> ")" => inner,
};

Parsing Input

Use the parser as follows:

// Read the script, tokenize it with the Logos-backed lexer, and hand the
// token stream to the generated parser; both fallible steps propagate with `?`.
let script_text = std::fs::read_to_string("myscript.toy")?;
let tokens = Lexer::new(&script_text);
let parser = ScriptParser::new();
let program = parser.parse(tokens)?;
println!("{:#?}", program);

Tags: rust lalrpop logos parsing lexer

Posted on Wed, 27 May 2026 22:45:42 +0000 by sloth456