LastMUD/internal/command/tokenizer.go

package command

import (
	"fmt"
	"regexp"
)

// TokenType identifies the kind of lexeme a Token represents.
type TokenType byte

const (
	TokenEOF TokenType = iota
	TokenUnknown
	TokenNumber
	TokenDecimal
	TokenIdentifier
	TokenBracketedIdentifier
	TokenText
	TokenDirection
	TokenCommand
	TokenSelf
	TokenWhitespace
)
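
// Note: TokenText and TokenCommand carry no pattern below; they are
// presumably assigned by later stages of command parsing rather than
// produced directly by the Tokenizer.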

func (tt TokenType) String() string {
	switch tt {
	case TokenEOF:
		return "EOF"
	case TokenUnknown:
		return "Unknown"
	case TokenNumber:
		return "Number"
	case TokenDecimal:
		return "Decimal"
	case TokenIdentifier:
		return "Identifier"
	case TokenBracketedIdentifier:
		return "BracketedIdentifier"
	case TokenText:
		return "Text"
	case TokenDirection:
		return "Direction"
	case TokenCommand:
		return "Command"
	case TokenSelf:
		return "Self"
	case TokenWhitespace:
		return "Whitespace"
	default:
		return fmt.Sprintf("TokenType(%d)", byte(tt))
	}
}

// Token is a single lexeme produced by the Tokenizer, together with
// its type and its position in the original input.
type Token struct {
	token  TokenType
	lexeme string
	index  int
}

// CreateToken constructs a Token from its parts.
func CreateToken(token TokenType, lexeme string, index int) Token {
	return Token{
		token:  token,
		lexeme: lexeme,
		index:  index,
	}
}

// Token returns the token's type.
func (t Token) Token() TokenType {
	return t.token
}

// Lexeme returns the source text matched for this token.
func (t Token) Lexeme() string {
	return t.lexeme
}

// Index returns the token's position in the original input.
func (t Token) Index() int {
	return t.index
}

// String formats the token for debugging as "index type: lexeme".
func (t Token) String() string {
	return fmt.Sprintf("%3d %16v: %q", t.index, t.token, t.lexeme)
}

// tokenPattern pairs a token type with the regular expression that
// recognizes it.
type tokenPattern struct {
	tokenType TokenType
	pattern   string
}

// Tokenizer turns a raw command string into a sequence of tokens.
// It is the starting point for parsing a command string.
// Create one with [CreateTokenizer].
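//
// A minimal usage sketch (the sample input here is illustrative):
//
//	tokenizer := CreateTokenizer()
//	tokens, err := tokenizer.Tokenize("go north")
//	if err != nil {
//		// a token pattern failed to compile
//	}
//	for _, token := range tokens {
//		fmt.Println(token)
//	}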
type Tokenizer struct {
	tokenPatterns []tokenPattern
}

// CreateTokenizer builds a Tokenizer with the default token patterns.
// Pattern order matters: the first pattern to match wins, so
// TokenDecimal must come before TokenNumber, and the keyword patterns
// must come before the more general TokenIdentifier and the catch-all
// TokenUnknown.
func CreateTokenizer() *Tokenizer {
	return &Tokenizer{
		tokenPatterns: []tokenPattern{
			{tokenType: TokenDecimal, pattern: `\b\d+\.\d+\b`},
			{tokenType: TokenNumber, pattern: `\b\d+\b`},
			{tokenType: TokenDirection, pattern: `\b(north|south|east|west|up|down)\b`},
			{tokenType: TokenBracketedIdentifier, pattern: `\[[ a-zA-Z0-9'-]+\]`},
			{tokenType: TokenSelf, pattern: `\bself\b`},
			{tokenType: TokenIdentifier, pattern: `\b[a-zA-Z'-][a-zA-Z0-9'-]*\b`},
			{tokenType: TokenWhitespace, pattern: `\s+`},
			{tokenType: TokenUnknown, pattern: `[^ \t\n\r\f\v]+`},
		},
	}
}

// Tokenize splits commandString into tokens. Whitespace is consumed
// but produces no token; a TokenEOF token always terminates the result.
func (t *Tokenizer) Tokenize(commandString string) (tokens []Token, err error) {
	tokens = []Token{}
	pos := 0
	inputLen := len(commandString)

	// Continue iterating until we reach the end of the input
	for pos < inputLen {
		matched := false
		remaining := commandString[pos:]

		// Try each token pattern in order against the remaining input
		for _, pattern := range t.tokenPatterns {
			// All patterns are case-insensitive and anchored to the
			// beginning of the remaining input (^)
			re, regexError := regexp.Compile(`(?i)^` + pattern.pattern)

			// A pattern that fails to compile is an error in the
			// pattern table; abort tokenization and report it
			if regexError != nil {
				tokens = nil
				err = regexError
				return
			}

			// A non-nil location means the pattern matched
			if loc := re.FindStringIndex(remaining); loc != nil {
				lexeme := remaining[loc[0]:loc[1]]
				start := pos
				pos += loc[1]
				matched = true

				// Whitespace is consumed but produces no token
				if pattern.tokenType == TokenWhitespace {
					break
				}

				// Record the token at its starting position
				tokens = append(tokens, CreateToken(pattern.tokenType, lexeme, start))
				break
			}
		}

		// Fallback: emit a single unmatched character as an Unknown
		// token so tokenization always makes progress
		if !matched {
			tokens = append(tokens, CreateToken(TokenUnknown, commandString[pos:pos+1], pos))
			pos++
		}
	}

	// Mark the end of the tokens
	tokens = append(tokens, CreateToken(TokenEOF, "", pos))
	return
}
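
// A usage sketch, not part of the original file: an example-style
// function (it would normally live in a _test.go file) showing the
// token types produced for a sample input. The expected output below
// is inferred from the patterns above, not captured from a real run.
func ExampleTokenizer_Tokenize() {
	tokenizer := CreateTokenizer()

	tokens, err := tokenizer.Tokenize("go north 2.5")
	if err != nil {
		panic(err) // only possible if a pattern fails to compile
	}

	for _, token := range tokens {
		fmt.Println(token.Token())
	}

	// Expected (whitespace is skipped):
	// Identifier
	// Direction
	// Decimal
	// EOF
}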