Skip to content

Commit

Permalink
support gzipped input
Browse files Browse the repository at this point in the history
  • Loading branch information
wmentor committed Nov 26, 2023
1 parent 80a6c50 commit 4ba38d5
Show file tree
Hide file tree
Showing 5 changed files with 105 additions and 4 deletions.
7 changes: 7 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Neither target produces a file of its own name, so both are declared phony.
.PHONY: all test

# Default target: simply delegates to the test target.
all: test

# Drop the cached test results first so that every test really re-runs,
# then run the tests of all packages in the module with coverage reporting.
test:
go clean -testcache
go test ./... -cover
11 changes: 10 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,13 @@ module github.com/wmentor/tokens

go 1.20

require github.com/wmentor/tbuf v1.0.1
require (
github.com/stretchr/testify v1.8.4
github.com/wmentor/tbuf v1.0.1
)

require (
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
)
6 changes: 6 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
github.com/wmentor/tbuf v1.0.1 h1:IonHpWwR0Wyh3Jfu0AbGSqzVDzUZ1zU61ML5F1CdBno=
github.com/wmentor/tbuf v1.0.1/go.mod h1:1lO+hvrkqqjEcR74vrNfBL3jg0NnpGHDWHeFxRsk7js=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
34 changes: 32 additions & 2 deletions tokenizer.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,10 @@ package tokens

import (
"bufio"
"compress/gzip"
"io"
"strings"
"sync"
"unicode"

buffer "github.com/wmentor/tbuf"
Expand All @@ -17,6 +19,10 @@ const (
bufferSize = 5
)

// Compile-time assertion: the build fails if *Tokenizer stops
// satisfying io.Closer.
var _ io.Closer = (*Tokenizer)(nil)

// Option is a constructor option: a function that is handed the *Tokenizer
// being built so that it can adjust it.
type Option func(*Tokenizer)

Expand All @@ -30,6 +36,8 @@ type Tokenizer struct {
mkr2 strings.Builder
rewriteRune map[rune]rune
fsmTab []stateFunc
toClose []io.Closer
once sync.Once
prevRune rune
endDone bool
caseSensitive bool
Expand All @@ -45,7 +53,7 @@ func WithCaseSensitive() Option {
}

// New - конструктор нового Tokenizer.
func New(rh io.Reader, opts ...Option) *Tokenizer {
func New(rh io.Reader, opts ...Option) (*Tokenizer, error) {
buf, _ := buffer.New(bufferSize)

tokenizer := &Tokenizer{
Expand All @@ -56,6 +64,19 @@ func New(rh io.Reader, opts ...Option) *Tokenizer {
fsmTab: make([]stateFunc, 11),
}

testBytes, err := tokenizer.rd.Peek(2)

// check first 2 bytes from GZIP file format specification https://www.ietf.org/rfc/rfc1952.txt
if err == nil && testBytes[0] == 31 && testBytes[1] == 139 {
gzreader, err := gzip.NewReader(tokenizer.rd)
if err != nil {
return nil, err
}
tokenizer.toClose = append(tokenizer.toClose, gzreader)

tokenizer.rd = bufio.NewReader(gzreader)
}

tokenizer.fsmTab[0] = tokenizer.state0
tokenizer.fsmTab[1] = tokenizer.state1
tokenizer.fsmTab[2] = tokenizer.state2
Expand All @@ -72,7 +93,16 @@ func New(rh io.Reader, opts ...Option) *Tokenizer {
opt(tokenizer)
}

return tokenizer
return tokenizer, nil
}

// Close closes every closer registered in t.toClose (for example the gzip
// reader that New creates for gzipped input).
//
// The closers are run only on the first call. Every registered closer is
// closed even if an earlier one fails, and the first error encountered is
// returned instead of being silently dropped. Subsequent calls do nothing
// and return nil.
func (t *Tokenizer) Close() error {
	var firstErr error

	t.once.Do(func() {
		for _, c := range t.toClose {
			// Keep going after a failure so no resource is left open,
			// but remember the first error to report it to the caller.
			if err := c.Close(); err != nil && firstErr == nil {
				firstErr = err
			}
		}
	})

	return firstErr
}

// Token - возвращает следующий токен или miner.ErrEndInput, если достигли конца.
Expand Down
51 changes: 50 additions & 1 deletion tokenizer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,14 @@
package tokens_test

import (
"bytes"
"compress/gzip"
"log"
"strings"
"testing"

"github.com/stretchr/testify/require"

generic "github.com/wmentor/tokens"
)

Expand All @@ -17,7 +22,34 @@ func tF(t *testing.T, src string, caseSensitive bool, wait string) {
if caseSensitive {
opts = append(opts, generic.WithCaseSensitive())
}
parser := generic.New(strings.NewReader(src), opts...)
parser, err := generic.New(strings.NewReader(src), opts...)
require.NoError(t, err)
defer parser.Close()

for {
tok, err := parser.Token()
if err != nil {
break
}
res = append(res, tok)
}

if strings.Join(res, "|") != wait {
t.Fatalf("test failed src=%s ret=%v wait=%v", src, res, wait)
}
}

func tFB(t *testing.T, src []byte, caseSensitive bool, wait string) {
t.Helper()

res := make([]string, 0, len(wait))
opts := make([]generic.Option, 0, 1)
if caseSensitive {
opts = append(opts, generic.WithCaseSensitive())
}
parser, err := generic.New(bytes.NewReader(src), opts...)
require.NoError(t, err)
defer parser.Close()

for {
tok, err := parser.Token()
Expand Down Expand Up @@ -224,3 +256,20 @@ func TestParser037(t *testing.T) {
tF(t, "победа муад'диба", false,
"победа|муад'диба")
}

// TestParser038 gzip-compresses a short text in memory and hands the
// compressed bytes to tFB, expecting the tokens "working|with|gzip".
func TestParser038(t *testing.T) {
	t.Parallel()

	const plain = "Working with gzip"

	// Build the gzipped payload in an in-memory buffer.
	var packed bytes.Buffer
	zw := gzip.NewWriter(&packed)

	_, err := zw.Write([]byte(plain))
	if err != nil {
		log.Fatal(err)
	}

	// Closing the writer flushes any pending compressed data into the buffer.
	err = zw.Close()
	if err != nil {
		log.Fatal(err)
	}

	tFB(t, packed.Bytes(), false, "working|with|gzip")
}

0 comments on commit 4ba38d5

Please sign in to comment.