Finish tokenizer

Miroslav Vasilev 2025-06-17 10:53:14 +03:00
parent adbeabc5d4
commit 181f2d2d2c
2 changed files with 134 additions and 41 deletions


@@ -1,11 +1,14 @@
 package commandlib
 
-import "strings"
+import (
+	"fmt"
+	"regexp"
+)
 
-type TokenType = byte
+type TokenType byte
 
 const (
-	TokenEOF = iota
+	TokenEOF TokenType = iota
 	TokenUnknown
@@ -13,23 +16,45 @@ const (
 	TokenDecimal
 	TokenIdentifier
 	TokenBracketedIdentifier
+	TokenText
 	TokenDirection
 	TokenCommand
+	TokenSayCommand
 	TokenSelf
-	TokenPunctuation
+	TokenWhitespace
 )
 
-var tokenPatterns = map[TokenType]string{
-	TokenNumber: `\b\d+\b`,
-	TokenDecimal: `\b\d+\.\d+\b`,
-	TokenIdentifier: `\b[a-zA-Z][a-zA-Z0-9]*\b`,
-	TokenBracketedIdentifier: `\[[a-zA-Z][a-zA-Z0-9]*\]`,
-	TokenDirection: `\b(north|south|east|west|up|down)\b`,
-	TokenSelf: `\bself\b`,
-	TokenPunctuation: `[,.!?'/":;\-\[\]\(\)]`,
-	TokenUnknown: `.`,
-}
+func (tt TokenType) String() string {
+	switch tt {
+	case TokenEOF:
+		return "EOF"
+	case TokenUnknown:
+		return "Unknown"
+	case TokenNumber:
+		return "Number"
+	case TokenDecimal:
+		return "Decimal"
+	case TokenIdentifier:
+		return "Identifier"
+	case TokenBracketedIdentifier:
+		return "BracketedIdentifier"
+	case TokenText:
+		return "Text"
+	case TokenDirection:
+		return "Direction"
+	case TokenCommand:
+		return "Command"
+	case TokenSayCommand:
+		return "SayCommand"
+	case TokenSelf:
+		return "Self"
+	case TokenWhitespace:
+		return "Whitespace"
+	default:
+		return fmt.Sprintf("TokenType(%d)", byte(tt))
+	}
+}
 
 type Token struct {
@@ -58,26 +83,76 @@ func (t Token) Index() int {
 	return t.index
 }
 
+func (t Token) String() string {
+	return fmt.Sprintf("%3d %16v: %q", t.index, t.token, t.lexeme)
+}
+
+type tokenPattern struct {
+	tokenType TokenType
+	pattern string
+}
+
 type tokenizer struct {
-	commandNameTokenRegex string
+	tokenPatterns []tokenPattern
 }
 
-func CreateTokenizer(commandNames []string) *tokenizer {
+func CreateTokenizer() *tokenizer {
 	return &tokenizer{
-		commandNameTokenRegex: `\b(` + strings.Join(commandNames, "|") + `)\b`,
+		tokenPatterns: []tokenPattern{
+			{tokenType: TokenDecimal, pattern: `\b\d+\.\d+\b`},
+			{tokenType: TokenNumber, pattern: `\b\d+\b`},
+			{tokenType: TokenDirection, pattern: `\b(north|south|east|west|up|down)\b`},
+			{tokenType: TokenBracketedIdentifier, pattern: `\[[ a-zA-Z0-9'-][ a-zA-Z0-9'-]*\]`},
+			{tokenType: TokenSelf, pattern: `\bself\b`},
+			{tokenType: TokenIdentifier, pattern: `\b[a-zA-Z'-][a-zA-Z0-9'-]*\b`},
+			{tokenType: TokenWhitespace, pattern: `\s+`},
+			{tokenType: TokenUnknown, pattern: `.`},
+		},
 	}
 }
 
-func (t *tokenizer) Tokenize(commandMsg string) (tokens []Token) {
+func (t *tokenizer) Tokenize(commandMsg string) (tokens []Token, err error) {
 	tokens = []Token{}
 	pos := 0
 	inputLen := len(commandMsg)
+	// Continue iterating until we reach the end of the input
 	for pos < inputLen {
 		matched := false
-		for tokenType, pattern := range tokenPatterns {
+		remaining := commandMsg[pos:]
+		// Iterate through each token type and test its pattern
+		for _, pattern := range t.tokenPatterns {
+			// All patterns are case-insensitive and must match the beginning of the input (^)
+			re, regexError := regexp.Compile(`(?i)^` + pattern.pattern)
+			if regexError != nil {
+				tokens = nil
+				err = regexError
+				return
+			}
+			// If the loc isn't nil, that means we've found a match
+			if loc := re.FindStringIndex(remaining); loc != nil {
+				lexeme := remaining[loc[0]:loc[1]]
+				pos += loc[1]
+				matched = true
+				tokens = append(tokens, CreateToken(pattern.tokenType, lexeme, pos))
+				break
+			}
+		}
+		// Unknown tokens are still added, except carriage return (\r) and newline (\n)
+		if !matched {
+			tokens = append(tokens, CreateToken(TokenUnknown, commandMsg[pos:pos+1], pos))
+			pos++
+		}
 	}
+	// Mark the end of the tokens
+	tokens = append(tokens, CreateToken(TokenEOF, "", pos))
+	return
 }
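For reference, a minimal sketch of driving the new tokenizer (not part of this commit; the module import path and the sample input are assumptions):

package main

import (
	"fmt"
	"log"

	"example.com/mud/commandlib" // hypothetical import path
)

func main() {
	tokenizer := commandlib.CreateTokenizer()
	// Pattern order matters: Decimal is tried before Number, so "3.14"
	// lexes as one Decimal token instead of three (Number, Unknown,
	// Number), and Direction/Self are tried before the catch-all
	// Identifier pattern.
	tokens, err := tokenizer.Tokenize("go north 3.14 [Old Well]")
	if err != nil {
		log.Fatal(err)
	}
	for _, tok := range tokens {
		fmt.Println(tok) // Token.String prints "%3d %16v: %q"
	}
}

Note that a token's index records the position just past its lexeme, since pos is advanced before CreateToken is called. The second changed file wires the tokenizer into the demo TCP server: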


@@ -19,15 +19,17 @@ type argValue struct {
 }
 
 func main() {
-	testcmd, err := commandlib.CreateCommand(
-		"test",
-		"t",
-		func(argValues []commandlib.ArgumentValue) (err error) {
-			err = nil
-			return
-		},
-		commandlib.CreateStringArg("test", "test message"),
-	)
+	// testcmd, err := commandlib.CreateCommand(
+	// 	"test",
+	// 	"t",
+	// 	func(argValues []commandlib.ArgumentValue) (err error) {
+	// 		err = nil
+	// 		return
+	// 	},
+	// 	commandlib.CreateStringArg("test", "test message"),
+	// )
+	tokenizer := commandlib.CreateTokenizer()
 
 	ln, err := net.Listen("tcp", ":8000")
@@ -51,25 +53,41 @@ func main() {
 			log.Fatal(err)
 		}
 
-		if strings.HasPrefix(message, testcmd.Name()) {
-			tokens := commandlib.Tokenize(message)
-			args := []commandlib.ArgumentValue{}
-			for _, v := range tokens[1:] {
-				args = append(args, commandlib.CreateArgValue(v))
-			}
-			err := testcmd.DoWork(args)
-			if err != nil {
-				fmt.Print(err.Error())
-			}
-		} else {
-			fmt.Print("Message Received: ", string(message))
-			response = strings.ToUpper(message)
-		}
+		conn.Write([]byte(message + "\n"))
+
+		tokens, err := tokenizer.Tokenize(message)
+
+		if err != nil {
+			response = err.Error()
+		} else {
+			lines := make([]string, len(tokens))
+			for i, tok := range tokens {
+				lines[i] = tok.String()
+			}
+			response = strings.Join(lines, "\n")
+		}
+
+		// if strings.HasPrefix(message, testcmd.Name()) {
+		// 	tokens := commandlib.Tokenize(message)
+		// 	args := []commandlib.ArgumentValue{}
+		// 	for _, v := range tokens[1:] {
+		// 		args = append(args, commandlib.CreateArgValue(v))
+		// 	}
+		// 	err := testcmd.DoWork(args)
+		// 	if err != nil {
+		// 		fmt.Print(err.Error())
+		// 	}
+		// } else {
+		// 	fmt.Print("Message Received: ", string(message))
+		// 	response = strings.ToUpper(message)
+		// }
 
 		conn.Write([]byte(response + "\n> "))
 	}
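With these changes the server echoes the raw line and then dumps the token stream. A session against the new loop might look roughly like this (hypothetical transcript; indices and padding follow the "%3d %16v: %q" format, and if the reader leaves the client's trailing newline on the message, an extra Whitespace token will appear before EOF):

$ nc localhost 8000
look north
look north
  4       Identifier: "look"
  5       Whitespace: " "
 10        Direction: "north"
 10              EOF: ""
> 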