From adbeabc5d490755d356831d3a01a05d8b0d56580 Mon Sep 17 00:00:00 2001
From: mvvasilev
Date: Tue, 17 Jun 2025 08:37:04 +0300
Subject: [PATCH] Start tokenizer

---
 src/CommandLib/tokenizer.go | 158 ++++++++++++++++++++++++++++++++++-----
 1 file changed, 134 insertions(+), 24 deletions(-)

diff --git a/src/CommandLib/tokenizer.go b/src/CommandLib/tokenizer.go
index dd7dcc8..b194e8b 100644
--- a/src/CommandLib/tokenizer.go
+++ b/src/CommandLib/tokenizer.go
@@ -1,30 +1,140 @@
 package commandlib
 
 import (
-	"strconv"
+	"regexp"
 	"strings"
 )
 
-func Tokenize(commandMsg string) []any {
-	split := strings.Split(commandMsg, " ")
-
-	tokens := []any{}
-
-	for _, v := range split {
-		valInt, err := strconv.ParseInt(v, 10, 32)
-
-		if err == nil {
-			tokens = append(tokens, valInt)
-		}
-
-		valFloat, err := strconv.ParseFloat(v, 32)
-
-		if err == nil {
-			tokens = append(tokens, valFloat)
-		}
-
-		tokens = append(tokens, v)
-	}
-
-	return tokens
-}
+// TokenType identifies the lexical class of a token.
+type TokenType byte
+
+const (
+	TokenEOF TokenType = iota
+
+	TokenUnknown
+
+	TokenNumber
+	TokenDecimal
+	TokenIdentifier
+	TokenBracketedIdentifier
+
+	TokenDirection
+	TokenCommand
+	TokenSelf
+
+	TokenPunctuation
+)
+
+var tokenPatterns = map[TokenType]string{
+	TokenNumber:              `\b\d+\b`,
+	TokenDecimal:             `\b\d+\.\d+\b`,
+	TokenIdentifier:          `\b[a-zA-Z][a-zA-Z0-9]*\b`,
+	TokenBracketedIdentifier: `\[[a-zA-Z][a-zA-Z0-9]*\]`,
+	TokenDirection:           `\b(north|south|east|west|up|down)\b`,
+	TokenSelf:                `\bself\b`,
+	TokenPunctuation:         `[,.!?'/":;\-\[\]\(\)]`,
+	TokenUnknown:             `.`,
+}
+
+// Patterns are tried in this order: a more specific pattern must come
+// before any more general pattern it overlaps with (TokenDecimal before
+// TokenNumber, keywords before TokenIdentifier, TokenUnknown last).
+// Ranging over the tokenPatterns map would give an unspecified order.
+var tokenPriority = []TokenType{
+	TokenDecimal,
+	TokenNumber,
+	TokenDirection,
+	TokenSelf,
+	TokenIdentifier,
+	TokenBracketedIdentifier,
+	TokenPunctuation,
+	TokenUnknown,
+}
+
+type Token struct {
+	token  TokenType
+	lexeme string
+	index  int
+}
+
+func CreateToken(token TokenType, lexeme string, index int) Token {
+	return Token{
+		token:  token,
+		lexeme: lexeme,
+		index:  index,
+	}
+}
+
+func (t Token) Token() TokenType {
+	return t.token
+}
+
+func (t Token) Lexeme() string {
+	return t.lexeme
+}
+
+func (t Token) Index() int {
+	return t.index
+}
+
+type tokenizer struct {
+	commandNameTokenRegex string
+}
+
+// CreateTokenizer builds a tokenizer that also recognizes the given
+// command names; the names are assumed to contain no regex metacharacters.
+func CreateTokenizer(commandNames []string) *tokenizer {
+	return &tokenizer{
+		commandNameTokenRegex: `\b(` + strings.Join(commandNames, "|") + `)\b`,
+	}
+}
+
+func (t *tokenizer) Tokenize(commandMsg string) (tokens []Token) {
+	tokens = []Token{}
+	pos := 0
+	inputLen := len(commandMsg)
+
+	for pos < inputLen {
+		// Whitespace separates tokens but produces none itself.
+		if commandMsg[pos] == ' ' {
+			pos++
+			continue
+		}
+
+		rest := commandMsg[pos:]
+
+		// Command names registered with the tokenizer take priority
+		// over the generic patterns.
+		if lexeme := matchAt(t.commandNameTokenRegex, rest); lexeme != "" {
+			tokens = append(tokens, CreateToken(TokenCommand, lexeme, pos))
+			pos += len(lexeme)
+			continue
+		}
+
+		matched := false
+
+		for _, tokenType := range tokenPriority {
+			if lexeme := matchAt(tokenPatterns[tokenType], rest); lexeme != "" {
+				tokens = append(tokens, CreateToken(tokenType, lexeme, pos))
+				pos += len(lexeme)
+				matched = true
+				break
+			}
+		}
+
+		// TokenUnknown matches any single byte, so this only guards
+		// against a future pattern change causing an infinite loop.
+		if !matched {
+			pos++
+		}
+	}
+
+	return append(tokens, CreateToken(TokenEOF, "", pos))
+}
+
+// matchAt returns the lexeme if pattern matches at the very start of s,
+// or "" otherwise. Compiling on every call is simple but slow; caching
+// compiled patterns would be an obvious next step.
+func matchAt(pattern, s string) string {
+	return regexp.MustCompile(`^(?:` + pattern + `)`).FindString(s)
+}
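
A quick usage sketch, not part of the patch. It assumes the completed
Tokenize above, that the commandlib package and fmt are imported, and
uses made-up command names ("look", "go", "take"):

	tkz := commandlib.CreateTokenizer([]string{"look", "go", "take"})

	// One line per token: "go" is a TokenCommand, "north" a
	// TokenDirection, "2.5" a TokenDecimal (tried before TokenNumber),
	// "meters" a TokenIdentifier, plus the trailing TokenEOF.
	for _, tok := range tkz.Tokenize("go north 2.5 meters") {
		fmt.Printf("type=%d lexeme=%q index=%d\n", tok.Token(), tok.Lexeme(), tok.Index())
	}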