LastMUD/internal/command/tokenizer.go

package command

import (
	"fmt"
	"regexp"
)

// TokenType identifies the kind of lexeme a Token represents.
type TokenType byte

const (
	TokenEOF TokenType = iota
	TokenUnknown
	TokenNumber
	TokenDecimal
	TokenIdentifier
	TokenBracketedIdentifier
	TokenText
	TokenDirection
	TokenCommand
	TokenSelf
	TokenWhitespace
)
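
// Note: TokenText and TokenCommand carry no pattern below; they are
// presumably assigned by later stages of command parsing rather than
// produced directly by the Tokenizer.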

func (tt TokenType) String() string {
	switch tt {
	case TokenEOF:
		return "EOF"
	case TokenUnknown:
		return "Unknown"
	case TokenNumber:
		return "Number"
	case TokenDecimal:
		return "Decimal"
	case TokenIdentifier:
		return "Identifier"
	case TokenBracketedIdentifier:
		return "BracketedIdentifier"
	case TokenText:
		return "Text"
	case TokenDirection:
		return "Direction"
	case TokenCommand:
		return "Command"
	case TokenSelf:
		return "Self"
	case TokenWhitespace:
		return "Whitespace"
	default:
		return fmt.Sprintf("TokenType(%d)", byte(tt))
	}
}

// Token is a single lexeme produced by the Tokenizer, together with
// its type and its position in the original input.
type Token struct {
	token  TokenType
	lexeme string
	index  int
}

// CreateToken constructs a Token from its parts.
func CreateToken(token TokenType, lexeme string, index int) Token {
	return Token{
		token:  token,
		lexeme: lexeme,
		index:  index,
	}
}

// Token returns the token's type.
func (t Token) Token() TokenType {
	return t.token
}

// Lexeme returns the source text matched for this token.
func (t Token) Lexeme() string {
	return t.lexeme
}

// Index returns the token's position in the original input.
func (t Token) Index() int {
	return t.index
}

// String formats the token for debugging as "index type: lexeme".
func (t Token) String() string {
	return fmt.Sprintf("%3d %16v: %q", t.index, t.token, t.lexeme)
}

// tokenPattern pairs a token type with the regular expression that
// recognizes it.
type tokenPattern struct {
	tokenType TokenType
	pattern   string
}

// Tokenizer turns a raw command string into a sequence of tokens.
// It is the starting point for parsing a command string.
// Create one with [CreateTokenizer].
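//
// A minimal usage sketch (the sample input here is illustrative):
//
//	tokenizer := CreateTokenizer()
//	tokens, err := tokenizer.Tokenize("go north")
//	if err != nil {
//		// a token pattern failed to compile
//	}
//	for _, token := range tokens {
//		fmt.Println(token)
//	}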
type Tokenizer struct {
	tokenPatterns []tokenPattern
}

// CreateTokenizer builds a Tokenizer with the default token patterns.
// Pattern order matters: the first pattern to match wins, so
// TokenDecimal must come before TokenNumber, and the keyword patterns
// must come before the more general TokenIdentifier and the catch-all
// TokenUnknown.
func CreateTokenizer() *Tokenizer {
	return &Tokenizer{
		tokenPatterns: []tokenPattern{
			{tokenType: TokenDecimal, pattern: `\b\d+\.\d+\b`},
			{tokenType: TokenNumber, pattern: `\b\d+\b`},
			{tokenType: TokenDirection, pattern: `\b(north|south|east|west|up|down)\b`},
			{tokenType: TokenBracketedIdentifier, pattern: `\[[ a-zA-Z0-9'-]+\]`},
			{tokenType: TokenSelf, pattern: `\bself\b`},
			{tokenType: TokenIdentifier, pattern: `\b[a-zA-Z'-][a-zA-Z0-9'-]*\b`},
			{tokenType: TokenWhitespace, pattern: `\s+`},
			{tokenType: TokenUnknown, pattern: `[^ \t\n\r\f\v]+`},
		},
	}
}

// Tokenize splits commandString into tokens. Whitespace is consumed
// but produces no token; a TokenEOF token always terminates the result.
func (t *Tokenizer) Tokenize(commandString string) (tokens []Token, err error) {
	tokens = []Token{}
	pos := 0
	inputLen := len(commandString)

	// Continue iterating until we reach the end of the input
	for pos < inputLen {
		matched := false
		remaining := commandString[pos:]

		// Try each token pattern in order against the remaining input
		for _, pattern := range t.tokenPatterns {
			// All patterns are case-insensitive and anchored to the
			// beginning of the remaining input (^)
			re, regexError := regexp.Compile(`(?i)^` + pattern.pattern)

			// A pattern that fails to compile is an error in the
			// pattern table; abort tokenization and report it
			if regexError != nil {
				tokens = nil
				err = regexError
				return
			}

			// A non-nil location means the pattern matched
			if loc := re.FindStringIndex(remaining); loc != nil {
				lexeme := remaining[loc[0]:loc[1]]
				start := pos
				pos += loc[1]
				matched = true

				// Whitespace is consumed but produces no token
				if pattern.tokenType == TokenWhitespace {
					break
				}

				// Record the token at its starting position
				tokens = append(tokens, CreateToken(pattern.tokenType, lexeme, start))
				break
			}
		}

		// Fallback: emit a single unmatched character as an Unknown
		// token so tokenization always makes progress
		if !matched {
			tokens = append(tokens, CreateToken(TokenUnknown, commandString[pos:pos+1], pos))
			pos++
		}
	}

	// Mark the end of the tokens
	tokens = append(tokens, CreateToken(TokenEOF, "", pos))
	return
}
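
// A usage sketch, not part of the original file: an example-style
// function (it would normally live in a _test.go file) showing the
// token types produced for a sample input. The expected output below
// is inferred from the patterns above, not captured from a real run.
func ExampleTokenizer_Tokenize() {
	tokenizer := CreateTokenizer()

	tokens, err := tokenizer.Tokenize("go north 2.5")
	if err != nil {
		panic(err) // only possible if a pattern fails to compile
	}

	for _, token := range tokens {
		fmt.Println(token.Token())
	}

	// Expected (whitespace is skipped):
	// Identifier
	// Direction
	// Decimal
	// EOF
}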