// Package tokenizer implements a rudimentary token parser of buffered
// io.Reader while respecting quotes and parenthesis boundaries.
//
// Example:
//
//	tk := tokenizer.NewFromString("a, b, (c, d)")
//	result, _ := tk.ScanAll() // ["a", "b", "(c, d)"]
package tokenizer
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"fmt"
|
||||
"io"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// eof represents a marker rune for the end of the reader.
|
||||
const eof = rune(0)
|
||||
|
||||
// DefaultSeparators is a list with the default token separator characters.
|
||||
var DefaultSeparators = []rune{',', ' ', '\t', '\n'}
|
||||
|
||||
// NewFromString creates new Tokenizer from the provided string.
|
||||
func NewFromString(str string) *Tokenizer {
|
||||
return New(strings.NewReader(str))
|
||||
}
|
||||
|
||||
// NewFromBytes creates new Tokenizer from the provided bytes slice.
|
||||
func NewFromBytes(b []byte) *Tokenizer {
|
||||
return New(bytes.NewReader(b))
|
||||
}
|
||||
|
||||
// New creates new Tokenizer from the provided reader.
|
||||
func New(r io.Reader) *Tokenizer {
|
||||
return &Tokenizer{
|
||||
r: bufio.NewReader(r),
|
||||
separators: DefaultSeparators,
|
||||
}
|
||||
}
|
||||
|
||||
// Tokenizer defines a struct that parses a reader into tokens while
|
||||
// respecting quotes and parenthesis boundaries.
|
||||
type Tokenizer struct {
|
||||
r *bufio.Reader
|
||||
|
||||
separators []rune
|
||||
}
|
||||
|
||||
// SetSeparators specifies the provided separatos of the current Tokenizer.
|
||||
func (s *Tokenizer) SetSeparators(separators ...rune) {
|
||||
s.separators = separators
|
||||
}
|
||||
|
||||
// Scan reads and returns the next available token from the Tokenizer's buffer (trimmed).
|
||||
//
|
||||
// Returns [io.EOF] error when there are no more tokens to scan.
|
||||
func (s *Tokenizer) Scan() (string, error) {
|
||||
ch := s.read()
|
||||
|
||||
if ch == eof {
|
||||
return "", io.EOF
|
||||
}
|
||||
|
||||
if isWhitespaceRune(ch) {
|
||||
s.readWhiteSpaces()
|
||||
} else {
|
||||
s.unread()
|
||||
}
|
||||
|
||||
token, err := s.readToken()
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
// read all remaining whitespaces
|
||||
s.readWhiteSpaces()
|
||||
|
||||
return token, err
|
||||
}
|
||||
|
||||
// ScanAll reads the entire Tokenizer's buffer and return all found tokens.
|
||||
func (s *Tokenizer) ScanAll() ([]string, error) {
|
||||
tokens := []string{}
|
||||
|
||||
for {
|
||||
token, err := s.Scan()
|
||||
if err != nil {
|
||||
if err == io.EOF {
|
||||
break
|
||||
}
|
||||
|
||||
return nil, err
|
||||
}
|
||||
|
||||
tokens = append(tokens, token)
|
||||
}
|
||||
|
||||
return tokens, nil
|
||||
}
|
||||
|
||||
// readToken reads a single token from the buffer and returns it.
|
||||
func (s *Tokenizer) readToken() (string, error) {
|
||||
var buf bytes.Buffer
|
||||
var parenthesis int
|
||||
var quoteCh rune
|
||||
var prevCh rune
|
||||
|
||||
for {
|
||||
ch := s.read()
|
||||
|
||||
if ch == eof {
|
||||
break
|
||||
}
|
||||
|
||||
if !isEscapeRune(prevCh) {
|
||||
if ch == '(' && quoteCh == eof {
|
||||
parenthesis++
|
||||
} else if ch == ')' && parenthesis > 0 && quoteCh == eof {
|
||||
parenthesis--
|
||||
} else if isQuoteRune(ch) {
|
||||
if quoteCh == ch {
|
||||
quoteCh = eof // reached closing quote
|
||||
} else if quoteCh == eof {
|
||||
quoteCh = ch // opening quote
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if s.isSeperatorRune(ch) && parenthesis == 0 && quoteCh == eof {
|
||||
break
|
||||
}
|
||||
|
||||
prevCh = ch
|
||||
buf.WriteRune(ch)
|
||||
}
|
||||
|
||||
if parenthesis > 0 || quoteCh != eof {
|
||||
return "", fmt.Errorf("unbalanced parenthesis or quoted expression: %q", buf.String())
|
||||
}
|
||||
|
||||
return buf.String(), nil
|
||||
}
|
||||
|
||||
// readWhiteSpaces consumes all contiguous whitespace runes.
|
||||
func (s *Tokenizer) readWhiteSpaces() {
|
||||
for {
|
||||
ch := s.read()
|
||||
|
||||
if ch == eof {
|
||||
break
|
||||
}
|
||||
|
||||
if !s.isSeperatorRune(ch) {
|
||||
s.unread()
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// read reads the next rune from the buffered reader.
|
||||
// Returns the `rune(0)` if an error or `io.EOF` occurs.
|
||||
func (s *Tokenizer) read() rune {
|
||||
ch, _, err := s.r.ReadRune()
|
||||
if err != nil {
|
||||
return eof
|
||||
}
|
||||
|
||||
return ch
|
||||
}
|
||||
|
||||
// unread places the previously read rune back on the reader.
|
||||
func (s *Tokenizer) unread() error {
|
||||
return s.r.UnreadRune()
|
||||
}
|
||||
|
||||
// isSeperatorRune checks if a rune is a token part separator.
|
||||
func (s *Tokenizer) isSeperatorRune(ch rune) bool {
|
||||
for _, r := range s.separators {
|
||||
if ch == r {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
// isWhitespaceRune checks if a rune is a space, tab, or newline.
|
||||
func isWhitespaceRune(ch rune) bool {
|
||||
return ch == ' ' || ch == '\t' || ch == '\n'
|
||||
}
|
||||
|
||||
// isQuoteRune checks if a rune is a quote.
|
||||
func isQuoteRune(ch rune) bool {
|
||||
return ch == '\'' || ch == '"' || ch == '`'
|
||||
}
|
||||
|
||||
// isEscapeRune checks if a rune is an escape character.
|
||||
func isEscapeRune(ch rune) bool {
|
||||
return ch == '\\'
|
||||
}
|
||||