[#3616] fixed tokenizer whitespace characters trimming
This commit is contained in:
@@ -21,6 +21,8 @@ const eof = rune(0)
|
||||
// DefaultSeparators is a list with the default token separator characters.
|
||||
var DefaultSeparators = []rune{','}
|
||||
|
||||
// whitespaceChars lists the runes that the tokenizer treats as whitespace:
// tab, new line, vertical tab, form feed, carriage return, space,
// U+0085 (NEL) and U+00A0 (NBSP).
//
// It is consulted by isWhitespaceRune and it is the source from which
// rebuildTrimCutset derives the per-tokenizer trim cutset.
var whitespaceChars = []rune{'\t', '\n', '\v', '\f', '\r', ' ', 0x85, 0xA0}
|
||||
|
||||
// NewFromString creates new Tokenizer from the provided string.
|
||||
func NewFromString(str string) *Tokenizer {
|
||||
return New(strings.NewReader(str))
|
||||
@@ -33,12 +35,11 @@ func NewFromBytes(b []byte) *Tokenizer {
|
||||
|
||||
// New creates new Tokenizer from the provided reader with DefaultSeparators.
|
||||
func New(r io.Reader) *Tokenizer {
|
||||
return &Tokenizer{
|
||||
r: bufio.NewReader(r),
|
||||
separators: DefaultSeparators,
|
||||
keepSeparator: false,
|
||||
ignoreParenthesis: false,
|
||||
}
|
||||
t := &Tokenizer{r: bufio.NewReader(r)}
|
||||
|
||||
t.Separators(DefaultSeparators...)
|
||||
|
||||
return t
|
||||
}
|
||||
|
||||
// Tokenizer defines a struct that parses a reader into tokens while
|
||||
@@ -46,14 +47,18 @@ func New(r io.Reader) *Tokenizer {
|
||||
type Tokenizer struct {
	// r is the buffered reader the tokens are read from.
	r *bufio.Reader

	// trimCutset holds the characters stripped from both ends of a token.
	// It is derived from the separators by rebuildTrimCutset.
	trimCutset string

	// separators are the runes that split the input into tokens
	// (set via Separators).
	separators []rune

	// keepSeparator is toggled by KeepSeparator.
	keepSeparator bool

	// keepEmptyTokens is toggled by KeepEmptyTokens; when false,
	// Scan skips empty tokens.
	keepEmptyTokens bool

	// ignoreParenthesis is toggled by IgnoreParenthesis; when true,
	// '(' and ')' are treated as regular characters.
	ignoreParenthesis bool
}
|
||||
|
||||
// Separators defines the provided separatos of the current Tokenizer.
|
||||
func (t *Tokenizer) Separators(separators ...rune) {
|
||||
t.separators = separators
|
||||
|
||||
t.rebuildTrimCutset()
|
||||
}
|
||||
|
||||
// KeepSeparator defines whether to keep the separator rune as part
|
||||
@@ -62,35 +67,37 @@ func (t *Tokenizer) KeepSeparator(state bool) {
|
||||
t.keepSeparator = state
|
||||
}
|
||||
|
||||
// KeepEmptyTokens defines whether to keep empty tokens on Scan() (default to false).
|
||||
func (t *Tokenizer) KeepEmptyTokens(state bool) {
|
||||
t.keepEmptyTokens = state
|
||||
}
|
||||
|
||||
// IgnoreParenthesis defines whether to ignore the parenthesis boundaries
|
||||
// and to treat the '(' and ')' as regular characters.
|
||||
func (t *Tokenizer) IgnoreParenthesis(state bool) {
|
||||
t.ignoreParenthesis = state
|
||||
}
|
||||
|
||||
// Scan reads and returns the next available token from the Tokenizer's buffer (trimmed).
|
||||
// Scan reads and returns the next available token from the Tokenizer's buffer (trimmed!).
|
||||
//
|
||||
// Empty tokens are skipped if t.keepEmptyTokens is not set (which is the default).
|
||||
//
|
||||
// Returns [io.EOF] error when there are no more tokens to scan.
|
||||
func (t *Tokenizer) Scan() (string, error) {
|
||||
ch := t.read()
|
||||
|
||||
if ch == eof {
|
||||
return "", io.EOF
|
||||
}
|
||||
|
||||
if isWhitespaceRune(ch) {
|
||||
t.readWhiteSpaces()
|
||||
} else {
|
||||
t.unread()
|
||||
}
|
||||
t.unread()
|
||||
|
||||
token, err := t.readToken()
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
// read all remaining whitespaces
|
||||
t.readWhiteSpaces()
|
||||
if !t.keepEmptyTokens && token == "" {
|
||||
return t.Scan()
|
||||
}
|
||||
|
||||
return token, err
|
||||
}
|
||||
@@ -129,12 +136,12 @@ func (t *Tokenizer) readToken() (string, error) {
|
||||
break
|
||||
}
|
||||
|
||||
if !isEscapeRune(prevCh) {
|
||||
if !t.isEscapeRune(prevCh) {
|
||||
if !t.ignoreParenthesis && ch == '(' && quoteCh == eof {
|
||||
parenthesis++ // opening parenthesis
|
||||
} else if !t.ignoreParenthesis && ch == ')' && parenthesis > 0 && quoteCh == eof {
|
||||
parenthesis-- // closing parenthesis
|
||||
} else if isQuoteRune(ch) {
|
||||
} else if t.isQuoteRune(ch) {
|
||||
if quoteCh == ch {
|
||||
quoteCh = eof // closing quote
|
||||
} else if quoteCh == eof {
|
||||
@@ -158,7 +165,7 @@ func (t *Tokenizer) readToken() (string, error) {
|
||||
return "", fmt.Errorf("unbalanced parenthesis or quoted expression: %q", buf.String())
|
||||
}
|
||||
|
||||
return buf.String(), nil
|
||||
return strings.Trim(buf.String(), t.trimCutset), nil
|
||||
}
|
||||
|
||||
// readWhiteSpaces consumes all contiguous whitespace runes.
|
||||
@@ -170,7 +177,7 @@ func (t *Tokenizer) readWhiteSpaces() {
|
||||
break
|
||||
}
|
||||
|
||||
if !t.isSeperatorRune(ch) {
|
||||
if !t.isWhitespaceRune(ch) {
|
||||
t.unread()
|
||||
break
|
||||
}
|
||||
@@ -193,6 +200,20 @@ func (t *Tokenizer) unread() error {
|
||||
return t.r.UnreadRune()
|
||||
}
|
||||
|
||||
// rebuildTrimCutset rebuilds the tokenizer trimCutset based on its separator runes.
|
||||
func (t *Tokenizer) rebuildTrimCutset() {
|
||||
var cutset strings.Builder
|
||||
|
||||
for _, w := range whitespaceChars {
|
||||
if t.isSeperatorRune(w) {
|
||||
continue
|
||||
}
|
||||
cutset.WriteRune(w)
|
||||
}
|
||||
|
||||
t.trimCutset = cutset.String()
|
||||
}
|
||||
|
||||
// isSeperatorRune checks if a rune is a token part separator.
|
||||
func (t *Tokenizer) isSeperatorRune(ch rune) bool {
|
||||
for _, r := range t.separators {
|
||||
@@ -204,17 +225,23 @@ func (t *Tokenizer) isSeperatorRune(ch rune) bool {
|
||||
return false
|
||||
}
|
||||
|
||||
// isWhitespaceRune checks if a rune is a space, tab, or newline.
|
||||
func isWhitespaceRune(ch rune) bool {
|
||||
return ch == ' ' || ch == '\t' || ch == '\n'
|
||||
// isWhitespaceRune checks if a rune is a space character (eg. space, tab, new line).
|
||||
func (t *Tokenizer) isWhitespaceRune(ch rune) bool {
|
||||
for _, c := range whitespaceChars {
|
||||
if c == ch {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
// isQuoteRune checks if a rune is a quote.
|
||||
func isQuoteRune(ch rune) bool {
|
||||
func (t *Tokenizer) isQuoteRune(ch rune) bool {
|
||||
return ch == '\'' || ch == '"' || ch == '`'
|
||||
}
|
||||
|
||||
// isEscapeRune checks if a rune is an escape character.
|
||||
func isEscapeRune(ch rune) bool {
|
||||
func (t *Tokenizer) isEscapeRune(ch rune) bool {
|
||||
return ch == '\\'
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user