package command

import (
	"fmt"
	"regexp"
)

// TokenType identifies the kind of token produced by the Tokenizer.
type TokenType byte

const (
	TokenEOF TokenType = iota

	TokenUnknown

	TokenNumber
	TokenDecimal
	TokenIdentifier
	TokenBracketedIdentifier
	TokenText

	TokenDirection
	TokenCommand
	TokenSelf

	TokenWhitespace
)

// String returns a human-readable name for the token type.
func (tt TokenType) String() string {
	switch tt {
	case TokenEOF:
		return "EOF"
	case TokenUnknown:
		return "Unknown"
	case TokenNumber:
		return "Number"
	case TokenDecimal:
		return "Decimal"
	case TokenIdentifier:
		return "Identifier"
	case TokenBracketedIdentifier:
		return "BracketedIdentifier"
	case TokenText:
		return "Text"
	case TokenDirection:
		return "Direction"
	case TokenCommand:
		return "Command"
	case TokenSelf:
		return "Self"
	case TokenWhitespace:
		return "Whitespace"
	default:
		return fmt.Sprintf("TokenType(%d)", byte(tt))
	}
}

// Token is a single lexed element of a command string.
type Token struct {
	token  TokenType
	lexeme string
	index  int
}

// CreateToken builds a Token from its type, lexeme, and index.
func CreateToken(token TokenType, lexeme string, index int) Token {
	return Token{
		token:  token,
		lexeme: lexeme,
		index:  index,
	}
}

// Token returns the token's type.
func (t Token) Token() TokenType {
	return t.token
}

// Lexeme returns the matched text for this token.
func (t Token) Lexeme() string {
	return t.lexeme
}

// Index returns the position recorded for this token within the input.
func (t Token) Index() int {
	return t.index
}

// String renders the token as its index, type, and quoted lexeme, which is
// convenient for debugging output.
func (t Token) String() string {
	return fmt.Sprintf("%3d %16v: %q", t.index, t.token, t.lexeme)
}

// tokenPattern pairs a token type with the regular expression used to match it.
type tokenPattern struct {
	tokenType TokenType
	pattern   string
}

// Tokenizer is used to tokenize a string input.
// It is the starting point for parsing a command string.
// Create one with [CreateTokenizer].
type Tokenizer struct {
	tokenPatterns []tokenPattern
}

// CreateTokenizer returns a Tokenizer configured with the default token
// patterns. Patterns are tried in order during tokenization, so more specific
// patterns (such as TokenDecimal) are listed before more general ones (such
// as TokenNumber).
func CreateTokenizer() *Tokenizer {
	return &Tokenizer{
		tokenPatterns: []tokenPattern{
			{tokenType: TokenDecimal, pattern: `\b\d+\.\d+\b`},
			{tokenType: TokenNumber, pattern: `\b\d+\b`},
			{tokenType: TokenDirection, pattern: `\b(north|south|east|west|up|down)\b`},
			{tokenType: TokenBracketedIdentifier, pattern: `\[[ a-zA-Z0-9'-][ a-zA-Z0-9'-]*\]`},
			{tokenType: TokenSelf, pattern: `\bself\b`},
			{tokenType: TokenIdentifier, pattern: `\b[a-zA-Z'-][a-zA-Z0-9'-]*\b`},
			{tokenType: TokenWhitespace, pattern: `\s+`},
			{tokenType: TokenUnknown, pattern: `[^ \t\n\r\f\v]+`},
		},
	}
}

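// As a rough illustration of how these patterns combine (the sample input
// below is an arbitrary example, not a fixture from this package): tokenizing
//
//	give [rusty sword] to guard
//
// yields an Identifier ("give"), a BracketedIdentifier ("[rusty sword]"),
// two further Identifiers ("to" and "guard"), and a closing EOF token;
// the whitespace between words is matched but discarded.
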
// Tokenize splits a command string into a slice of tokens, ending with a
// TokenEOF marker.
func (t *Tokenizer) Tokenize(commandString string) (tokens []Token, err error) {
	tokens = []Token{}
	pos := 0
	inputLen := len(commandString)

	// Continue iterating until we reach the end of the input.
	for pos < inputLen {
		matched := false
		remaining := commandString[pos:]

		// Try each token pattern in order; the first one that matches wins.
		for _, pattern := range t.tokenPatterns {
			// All patterns are case-insensitive and must match at the beginning of the remaining input (^).
			re, regexError := regexp.Compile(`(?i)^` + pattern.pattern)

			// If a pattern fails to compile (wrongly defined pattern?), stop tokenization and report the error.
			if regexError != nil {
				tokens = nil
				err = regexError
				return
			}

			// A non-nil location means the pattern matched.
			if loc := re.FindStringIndex(remaining); loc != nil {
				lexeme := remaining[loc[0]:loc[1]]

				// Advance past the lexeme; the advanced position is recorded as the token's index.
				pos += loc[1]
				matched = true

				// Whitespace is consumed but never emitted as a token.
				if pattern.tokenType == TokenWhitespace {
					break
				}

				tokens = append(tokens, CreateToken(pattern.tokenType, lexeme, pos))
				break
			}
		}

		// If nothing matched, emit the offending character as an Unknown token and move on.
		if !matched {
			tokens = append(tokens, CreateToken(TokenUnknown, commandString[pos:pos+1], pos))
			pos++
		}
	}

	// Mark the end of the tokens.
	tokens = append(tokens, CreateToken(TokenEOF, "", pos))

	return
}

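// exampleTokenizeUsage is an illustrative sketch of how a caller might drive
// the tokenizer; the sample command string and the decision to print each
// token are assumptions made purely for demonstration.
func exampleTokenizeUsage() {
	tokenizer := CreateTokenizer()

	tokens, err := tokenizer.Tokenize("give [rusty sword] to guard")
	if err != nil {
		// Only reachable if one of the configured patterns fails to compile.
		fmt.Println("tokenize error:", err)
		return
	}

	// Each token prints as its index, type, and quoted lexeme (see Token.String).
	for _, token := range tokens {
		fmt.Println(token)
	}
}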