diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..04c4bf9
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,7 @@
+.PHONY: all test
+
+all: test
+
+test:
+	go clean -testcache
+	go test ./... -cover
diff --git a/go.mod b/go.mod
index fab91c8..0635c07 100644
--- a/go.mod
+++ b/go.mod
@@ -2,4 +2,13 @@ module github.com/wmentor/tokens
 
 go 1.20
 
-require github.com/wmentor/tbuf v1.0.1
+require (
+	github.com/stretchr/testify v1.8.4
+	github.com/wmentor/tbuf v1.0.1
+)
+
+require (
+	github.com/davecgh/go-spew v1.1.1 // indirect
+	github.com/pmezard/go-difflib v1.0.0 // indirect
+	gopkg.in/yaml.v3 v3.0.1 // indirect
+)
diff --git a/go.sum b/go.sum
index 2369632..a3c5ea5 100644
--- a/go.sum
+++ b/go.sum
@@ -1,6 +1,12 @@
 github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
+github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
+github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
 github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
+github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
 github.com/wmentor/tbuf v1.0.1 h1:IonHpWwR0Wyh3Jfu0AbGSqzVDzUZ1zU61ML5F1CdBno=
 github.com/wmentor/tbuf v1.0.1/go.mod h1:1lO+hvrkqqjEcR74vrNfBL3jg0NnpGHDWHeFxRsk7js=
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
 gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
+gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
diff --git a/tokenizer.go b/tokenizer.go
index 24d615d..3d5a216 100644
--- a/tokenizer.go
+++ b/tokenizer.go
@@ -4,8 +4,10 @@ package tokens
 
 import (
 	"bufio"
+	"compress/gzip"
 	"io"
 	"strings"
+	"sync"
 	"unicode"
 
 	buffer "github.com/wmentor/tbuf"
@@ -17,6 +19,10 @@
 const (
 	bufferSize = 5
 )
 
+var (
+	_ io.Closer = (*Tokenizer)(nil)
+)
+
 // Option - a constructor option.
 type Option func(*Tokenizer)
@@ -30,6 +36,8 @@ type Tokenizer struct {
 	mkr2          strings.Builder
 	rewriteRune   map[rune]rune
 	fsmTab        []stateFunc
+	toClose       []io.Closer
+	once          sync.Once
 	prevRune      rune
 	endDone       bool
 	caseSensitive bool
@@ -45,7 +53,7 @@ func WithCaseSensitive() Option {
 }
 
 // New - constructs a new Tokenizer.
-func New(rh io.Reader, opts ...Option) *Tokenizer {
+func New(rh io.Reader, opts ...Option) (*Tokenizer, error) {
 	buf, _ := buffer.New(bufferSize)
 
 	tokenizer := &Tokenizer{
@@ -56,6 +64,20 @@ func New(rh io.Reader, opts ...Option) *Tokenizer {
 		fsmTab: make([]stateFunc, 11),
 	}
 
+	testBytes, err := tokenizer.rd.Peek(2)
+
+	// Check the first two bytes for the gzip magic number (see the gzip file
+	// format specification, RFC 1952: https://www.ietf.org/rfc/rfc1952.txt).
+	if err == nil && testBytes[0] == 0x1f && testBytes[1] == 0x8b {
+		gzreader, err := gzip.NewReader(tokenizer.rd)
+		if err != nil {
+			return nil, err
+		}
+		tokenizer.toClose = append(tokenizer.toClose, gzreader)
+
+		tokenizer.rd = bufio.NewReader(gzreader)
+	}
+
 	tokenizer.fsmTab[0] = tokenizer.state0
 	tokenizer.fsmTab[1] = tokenizer.state1
 	tokenizer.fsmTab[2] = tokenizer.state2
@@ -72,7 +94,20 @@ func New(rh io.Reader, opts ...Option) *Tokenizer {
 		opt(tokenizer)
 	}
 
-	return tokenizer
+	return tokenizer, nil
+}
+
+// Close releases any resources opened by New (the gzip reader, if one was created).
+func (t *Tokenizer) Close() error {
+	var err error
+	t.once.Do(func() {
+		for _, rec := range t.toClose {
+			if e := rec.Close(); e != nil && err == nil {
+				err = e
+			}
+		}
+	})
+	return err
 }
 
 // Token - returns the next token, or miner.ErrEndInput when the end of input is reached.
diff --git a/tokenizer_test.go b/tokenizer_test.go
index 4804f76..4dd7f25 100644
--- a/tokenizer_test.go
+++ b/tokenizer_test.go
@@ -3,9 +3,13 @@
 package tokens_test
 
 import (
+	"bytes"
+	"compress/gzip"
 	"strings"
 	"testing"
 
+	"github.com/stretchr/testify/require"
+
 	generic "github.com/wmentor/tokens"
 )
 
@@ -17,7 +21,34 @@ func tF(t *testing.T, src string, caseSensitive bool, wait string) {
 	if caseSensitive {
 		opts = append(opts, generic.WithCaseSensitive())
 	}
-	parser := generic.New(strings.NewReader(src), opts...)
+	parser, err := generic.New(strings.NewReader(src), opts...)
+	require.NoError(t, err)
+	defer parser.Close()
+
+	for {
+		tok, err := parser.Token()
+		if err != nil {
+			break
+		}
+		res = append(res, tok)
+	}
+
+	if strings.Join(res, "|") != wait {
+		t.Fatalf("test failed src=%s ret=%v wait=%v", src, res, wait)
+	}
+}
+
+func tFB(t *testing.T, src []byte, caseSensitive bool, wait string) {
+	t.Helper()
+
+	res := make([]string, 0, len(wait))
+	opts := make([]generic.Option, 0, 1)
+	if caseSensitive {
+		opts = append(opts, generic.WithCaseSensitive())
+	}
+	parser, err := generic.New(bytes.NewReader(src), opts...)
+	require.NoError(t, err)
+	defer parser.Close()
 
 	for {
 		tok, err := parser.Token()
@@ -224,3 +255,20 @@ func TestParser037(t *testing.T) {
 
 	tF(t, "победа муад'диба", false, "победа|муад'диба")
 }
+
+func TestParser038(t *testing.T) {
+	t.Parallel()
+
+	txt := "Working with gzip"
+
+	b := bytes.NewBuffer(nil)
+	gz := gzip.NewWriter(b)
+	if _, err := gz.Write([]byte(txt)); err != nil {
+		t.Fatal(err)
+	}
+	if err := gz.Close(); err != nil {
+		t.Fatal(err)
+	}
+
+	tFB(t, b.Bytes(), false, "working|with|gzip")
+}
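For reviewers: a minimal usage sketch of the API after this change. New now returns an error (gzip header detection can fail), and callers own the Close call, which releases the stacked gzip reader when one was created. The input file name and log-based error handling below are illustrative only, not part of the patch.

package main

import (
	"fmt"
	"log"
	"os"

	"github.com/wmentor/tokens"
)

func main() {
	// Plain and gzip-compressed input both work: New peeks at the first
	// two bytes and transparently wraps the reader in a gzip.Reader when
	// it sees the 0x1f 0x8b magic number.
	fh, err := os.Open("corpus.txt.gz") // hypothetical input file
	if err != nil {
		log.Fatal(err)
	}
	defer fh.Close()

	parser, err := tokens.New(fh)
	if err != nil {
		log.Fatal(err)
	}
	defer parser.Close() // closes the internal gzip reader, if any

	for {
		tok, err := parser.Token()
		if err != nil { // miner.ErrEndInput signals end of input
			break
		}
		fmt.Println(tok)
	}
}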