From 181f2d2d2cbaae990bb045963b6b9423e737b993 Mon Sep 17 00:00:00 2001 From: Miroslav Vasilev Date: Tue, 17 Jun 2025 10:53:14 +0300 Subject: [PATCH] Finish tokenizer --- src/CommandLib/tokenizer.go | 113 ++++++++++++++++++++++++++++++------ src/Server/main.go | 62 +++++++++++++------- 2 files changed, 134 insertions(+), 41 deletions(-) diff --git a/src/CommandLib/tokenizer.go b/src/CommandLib/tokenizer.go index b194e8b..7a1ef2f 100644 --- a/src/CommandLib/tokenizer.go +++ b/src/CommandLib/tokenizer.go @@ -1,11 +1,14 @@ package commandlib -import "strings" +import ( + "fmt" + "regexp" +) -type TokenType = byte +type TokenType byte const ( - TokenEOF = iota + TokenEOF TokenType = iota TokenUnknown @@ -13,23 +16,45 @@ const ( TokenDecimal TokenIdentifier TokenBracketedIdentifier + TokenText TokenDirection TokenCommand + TokenSayCommand TokenSelf - TokenPunctuation + TokenWhitespace ) -var tokenPatterns = map[TokenType]string{ - TokenNumber: `\b\d+\b`, - TokenDecimal: `\b\d+\.\d+\b`, - TokenIdentifier: `\b[a-zA-Z][a-zA-Z0-9]*\b`, - TokenBracketedIdentifier: `\[[a-zA-Z][a-zA-Z0-9]*\]`, - TokenDirection: `\b(north|south|east|west|up|down)\b`, - TokenSelf: `\bself\b`, - TokenPunctuation: `[,.!?'/":;\-\[\]\(\)]`, - TokenUnknown: `.`, +func (tt TokenType) String() string { + switch tt { + case TokenEOF: + return "EOF" + case TokenUnknown: + return "Unknown" + case TokenNumber: + return "Number" + case TokenDecimal: + return "Decimal" + case TokenIdentifier: + return "Identifier" + case TokenBracketedIdentifier: + return "BracketedIdentifier" + case TokenText: + return "Text" + case TokenDirection: + return "Direction" + case TokenCommand: + return "Command" + case TokenSayCommand: + return "SayCommand" + case TokenSelf: + return "Self" + case TokenWhitespace: + return "Whitespace" + default: + return fmt.Sprintf("TokenType(%d)", byte(tt)) + } } type Token struct { @@ -58,26 +83,76 @@ func (t Token) Index() int { return t.index } -type tokenizer struct { - commandNameTokenRegex string +func (t Token) String() string { + return fmt.Sprintf("%3d %16v: %q", t.index, t.token, t.lexeme) } -func CreateTokenizer(commandNames []string) *tokenizer { +type tokenPattern struct { + tokenType TokenType + pattern string +} + +type tokenizer struct { + tokenPatterns []tokenPattern +} + +func CreateTokenizer() *tokenizer { return &tokenizer{ - commandNameTokenRegex: `\b(` + strings.Join(commandNames, "|") + `)\b`, + tokenPatterns: []tokenPattern{ + {tokenType: TokenDecimal, pattern: `\b\d+\.\d+\b`}, + {tokenType: TokenNumber, pattern: `\b\d+\b`}, + {tokenType: TokenDirection, pattern: `\b(north|south|east|west|up|down)\b`}, + {tokenType: TokenBracketedIdentifier, pattern: `\[[ a-zA-Z0-9'-][ a-zA-Z0-9'-]*\]`}, + {tokenType: TokenSelf, pattern: `\bself\b`}, + {tokenType: TokenIdentifier, pattern: `\b[a-zA-Z'-][a-zA-Z0-9'-]*\b`}, + {tokenType: TokenWhitespace, pattern: `\s+`}, + {tokenType: TokenUnknown, pattern: `.`}, + }, } } -func (t *tokenizer) Tokenize(commandMsg string) (tokens []Token) { +func (t *tokenizer) Tokenize(commandMsg string) (tokens []Token, err error) { tokens = []Token{} pos := 0 inputLen := len(commandMsg) + // Continue iterating until we reach the end of the input for pos < inputLen { matched := false + remaining := commandMsg[pos:] - for tokenType, pattern := range tokenPatterns { + // Iterate through each token type and test its pattern + for _, pattern := range t.tokenPatterns { + // All patterns are case-insensitive and must match the beginning of the input (^) + re, regexError := regexp.Compile(`(?i)^` + pattern.pattern) + if regexError != nil { + tokens = nil + err = regexError + return + } + + // If the loc isn't nil, that means we've found a match + if loc := re.FindStringIndex(remaining); loc != nil { + lexeme := remaining[loc[0]:loc[1]] + + pos += loc[1] + matched = true + + tokens = append(tokens, CreateToken(pattern.tokenType, lexeme, pos)) + break + } + } + + // Unknown tokens are still added, except carriage return (\r) and newline (\n) + if !matched { + tokens = append(tokens, CreateToken(TokenUnknown, commandMsg[pos:pos+1], pos)) + pos++ } } + + // Mark the end of the tokens + tokens = append(tokens, CreateToken(TokenEOF, "", pos)) + + return } diff --git a/src/Server/main.go b/src/Server/main.go index e591613..285a602 100644 --- a/src/Server/main.go +++ b/src/Server/main.go @@ -19,15 +19,17 @@ type argValue struct { } func main() { - testcmd, err := commandlib.CreateCommand( - "test", - "t", - func(argValues []commandlib.ArgumentValue) (err error) { - err = nil - return - }, - commandlib.CreateStringArg("test", "test message"), - ) + // testcmd, err := commandlib.CreateCommand( + // "test", + // "t", + // func(argValues []commandlib.ArgumentValue) (err error) { + // err = nil + // return + // }, + // commandlib.CreateStringArg("test", "test message"), + // ) + + tokenizer := commandlib.CreateTokenizer() ln, err := net.Listen("tcp", ":8000") @@ -51,25 +53,41 @@ func main() { log.Fatal(err) } - if strings.HasPrefix(message, testcmd.Name()) { - tokens := commandlib.Tokenize(message) - args := []commandlib.ArgumentValue{} + conn.Write([]byte(message + "\n")) - for _, v := range tokens[1:] { - args = append(args, commandlib.CreateArgValue(v)) - } + tokens, err := tokenizer.Tokenize(message) - err := testcmd.DoWork(args) - - if err != nil { - fmt.Print(err.Error()) - } + if err != nil { + response = err.Error() } else { - fmt.Print("Message Received: ", string(message)) + lines := make([]string, len(tokens)) - response = strings.ToUpper(message) + for i, tok := range tokens { + lines[i] = tok.String() + } + + response = strings.Join(lines, "\n") } + // if strings.HasPrefix(message, testcmd.Name()) { + // tokens := commandlib.Tokenize(message) + // args := []commandlib.ArgumentValue{} + + // for _, v := range tokens[1:] { + // args = append(args, commandlib.CreateArgValue(v)) + // } + + // err := testcmd.DoWork(args) + + // if err != nil { + // fmt.Print(err.Error()) + // } + // } else { + // fmt.Print("Message Received: ", string(message)) + + // response = strings.ToUpper(message) + // } + conn.Write([]byte(response + "\n> ")) }