Skip to content

Commit

Permalink
Merge pull request #6 from SEKOIA-IO/feat/detect_encoding
Browse files Browse the repository at this point in the history
feat: Detect file encoding automatically
  • Loading branch information
Darkheir authored Apr 5, 2024
2 parents f7593b6 + 13455ac commit e2b722f
Show file tree
Hide file tree
Showing 15 changed files with 1,853 additions and 2 deletions.
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ go 1.17

require (
github.com/fsnotify/fsnotify v1.6.0
github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f
gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7
)

Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
github.com/fsnotify/fsnotify v1.6.0 h1:n+5WquG0fcWoWp6xPWfHdbskMCQaFnG6PfBrh1Ky4HY=
github.com/fsnotify/fsnotify v1.6.0/go.mod h1:sl3t1tCWJFWoRz9R8WJCbQihKKwmorjAbSClcnxKAGw=
github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f h1:3BSP1Tbs2djlpprl7wCLuiqMaUh5SJkkzI2gDs+FgLs=
github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f/go.mod h1:Pcatq5tYkCW2Q6yrR2VRHlbHpZ/R4/7qyL1TCF7vl14=
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
Expand Down
30 changes: 28 additions & 2 deletions tail.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import (
"github.com/SEKOIA-IO/tail/watch"
"gopkg.in/tomb.v1"

"github.com/gogs/chardet"
"golang.org/x/text/encoding/ianaindex"
"golang.org/x/text/transform"
)
Expand Down Expand Up @@ -431,15 +432,40 @@ func (tail *Tail) openReader() {
}

func (tail *Tail) getTransformReader() io.Reader {
if tail.Encoding == "" || strings.ToUpper(tail.Encoding) == "UTF-8" {
encoding := tail.getEncoding()
if strings.ToUpper(encoding) == "UTF-8" {
// No need for a transformer
return tail.file
}
encode, _ := ianaindex.IANA.Encoding(tail.Encoding)
encode, _ := ianaindex.IANA.Encoding(encoding)
reader := transform.NewReader(tail.file, encode.NewDecoder())
return reader
}

// getEncoding returns the name of the character encoding to use for the
// tailed file.
//
// An explicitly configured tail.Encoding always wins. Otherwise the first
// bytes of the file (up to 1024) are sampled and handed to chardet; the
// detected charset is returned only when the detector's confidence is at
// least 80. In every other situation (I/O error, empty file, detection
// failure, low confidence) "UTF-8" is returned.
//
// The file offset in effect on entry is restored before returning.
func (tail *Tail) getEncoding() string {
	if tail.Encoding != "" {
		return tail.Encoding
	}

	const (
		fallback   = "UTF-8"
		sampleSize = 1024
	)

	// Remember where we are so the sampling below is invisible to the caller.
	currentOffset, err := tail.file.Seek(0, io.SeekCurrent)
	if err != nil {
		return fallback
	}
	if _, err := tail.file.Seek(0, io.SeekStart); err != nil {
		return fallback
	}

	buf := make([]byte, sampleSize)
	n, readErr := tail.file.Read(buf)

	// Restore the offset before looking at the read result so that a failed
	// read still leaves the file positioned where it was.
	if _, err := tail.file.Seek(currentOffset, io.SeekStart); err != nil {
		return fallback
	}
	if readErr != nil || n == 0 {
		return fallback
	}

	// Only pass the bytes actually read: the unused tail of buf is zero-filled
	// and would otherwise be treated as file content by the detector.
	detector := chardet.NewTextDetector()
	result, err := detector.DetectBest(buf[:n])
	if err != nil || result.Confidence < 80 {
		return fallback
	}
	return result.Charset
}

// seekEnd asks seekTo to move to offset 0 relative to the end of the file
// and returns whatever error seekTo reports.
func (tail *Tail) seekEnd() error {
	end := SeekInfo{Whence: io.SeekEnd, Offset: 0}
	return tail.seekTo(end)
}
Expand Down
102 changes: 102 additions & 0 deletions vendor/github.com/gogs/chardet/2022.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions vendor/github.com/gogs/chardet/AUTHORS

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

22 changes: 22 additions & 0 deletions vendor/github.com/gogs/chardet/LICENSE

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 12 additions & 0 deletions vendor/github.com/gogs/chardet/README.md

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

147 changes: 147 additions & 0 deletions vendor/github.com/gogs/chardet/detector.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit e2b722f

Please sign in to comment.