From f6dcac11b17a3dd55abdff9e929f085339683be8 Mon Sep 17 00:00:00 2001 From: Charlie Vieth Date: Sat, 6 Apr 2024 23:14:19 -0400 Subject: [PATCH] WIP: move parsing back to fastwalk --- fastwalk_getdirentries_darwin.go | 56 ++--------------- fastwalk_unix.go | 93 ++++++++++++++++------------- internal/dirent/dirent.go | 59 +----------------- internal/dirent/dirent_aix.go | 10 ++-- internal/dirent/dirent_darwin.go | 46 ++++++++++++++ internal/dirent/dirent_dragonfly.go | 10 ++-- internal/dirent/dirent_freebsd.go | 8 +-- internal/dirent/dirent_js.go | 10 ++-- internal/dirent/dirent_linux.go | 10 ++-- internal/dirent/dirent_netbsd.go | 8 +-- internal/dirent/dirent_openbsd.go | 8 +-- internal/dirent/dirent_solaris.go | 10 ++-- 12 files changed, 140 insertions(+), 188 deletions(-) create mode 100644 internal/dirent/dirent_darwin.go diff --git a/fastwalk_getdirentries_darwin.go b/fastwalk_getdirentries_darwin.go index bc581bc..4867937 100644 --- a/fastwalk_getdirentries_darwin.go +++ b/fastwalk_getdirentries_darwin.go @@ -8,8 +8,11 @@ import ( "sync" "syscall" "unsafe" + + "github.com/charlievieth/fastwalk/internal/dirent" ) +// TODO: increase const direntBufSize = 32 * 1024 var direntBufPool = sync.Pool{ @@ -43,18 +46,18 @@ func readDir(dirName string, fn func(dirName, entName string, de fs.DirEntry) er buf := dbuf[:length] for i := 0; len(buf) > 0; i++ { - reclen, ok := direntReclen(buf) + reclen, ok := dirent.DirentReclen(buf) if !ok || reclen > uint64(len(buf)) { break } rec := buf[:reclen] buf = buf[reclen:] - typ := direntType(rec) + typ := dirent.DirentType(rec) if skipFiles && typ.IsRegular() { continue } const namoff = uint64(unsafe.Offsetof(syscall.Dirent{}.Name)) - namlen, ok := direntNamlen(rec) + namlen, ok := dirent.DirentNamlen(rec) if !ok || namoff+namlen > uint64(len(rec)) { break } @@ -80,50 +83,3 @@ func readDir(dirName string, fn func(dirName, entName string, de fs.DirEntry) er return nil } - -// readInt returns the size-bytes unsigned integer in native byte order at offset off. -func readInt(b []byte, off, size uintptr) (uint64, bool) { - if len(b) >= int(off+size) { - p := b[off:] - _ = p[1] // bounds check hint to compiler; see golang.org/issue/14808 - return uint64(p[0]) | uint64(p[1])<<8, true - } - return 0, false -} - -// Statically assert that the size of Reclen and Namlen is 2. -var _ = ([2]int{})[unsafe.Sizeof(syscall.Dirent{}.Reclen)-1] -var _ = ([2]int{})[unsafe.Sizeof(syscall.Dirent{}.Namlen)-1] - -func direntReclen(buf []byte) (uint64, bool) { - return readInt(buf, unsafe.Offsetof(syscall.Dirent{}.Reclen), unsafe.Sizeof(syscall.Dirent{}.Reclen)) -} - -func direntNamlen(buf []byte) (uint64, bool) { - return readInt(buf, unsafe.Offsetof(syscall.Dirent{}.Namlen), unsafe.Sizeof(syscall.Dirent{}.Namlen)) -} - -func direntType(buf []byte) os.FileMode { - off := unsafe.Offsetof(syscall.Dirent{}.Type) - if off >= uintptr(len(buf)) { - return ^os.FileMode(0) // unknown - } - typ := buf[off] - switch typ { - case syscall.DT_BLK: - return os.ModeDevice - case syscall.DT_CHR: - return os.ModeDevice | os.ModeCharDevice - case syscall.DT_DIR: - return os.ModeDir - case syscall.DT_FIFO: - return os.ModeNamedPipe - case syscall.DT_LNK: - return os.ModeSymlink - case syscall.DT_REG: - return 0 - case syscall.DT_SOCK: - return os.ModeSocket - } - return ^os.FileMode(0) -} diff --git a/fastwalk_unix.go b/fastwalk_unix.go index ad85f9e..7e8ba67 100644 --- a/fastwalk_unix.go +++ b/fastwalk_unix.go @@ -9,17 +9,22 @@ package fastwalk import ( "io/fs" "os" + "sync" "syscall" + "unsafe" "github.com/charlievieth/fastwalk/internal/dirent" ) -// More than 5760 to work around https://golang.org/issue/24015. -const blockSize = 8192 +// Empirical testing shows that 32k is the ideal buffer size. +const direntBufSize = 32 * 1024 -// unknownFileMode is a sentinel (and bogus) os.FileMode -// value used to represent a syscall.DT_UNKNOWN Dirent.Type. -const unknownFileMode os.FileMode = os.ModeNamedPipe | os.ModeSocket | os.ModeDevice +var direntBufPool = sync.Pool{ + New: func() interface{} { + b := make([]byte, direntBufSize) + return &b + }, +} func readDir(dirName string, fn func(dirName, entName string, de fs.DirEntry) error) error { fd, err := open(dirName, 0, 0) @@ -28,52 +33,54 @@ func readDir(dirName string, fn func(dirName, entName string, de fs.DirEntry) er } defer syscall.Close(fd) - // The buffer must be at least a block long. - buf := make([]byte, blockSize) // stack-allocated; doesn't escape - bufp := 0 // starting read position in buf - nbuf := 0 // end valid data in buf + pb := direntBufPool.Get().(*[]byte) + defer direntBufPool.Put(pb) + bbuf := *pb + skipFiles := false for { - if bufp >= nbuf { - bufp = 0 - nbuf, err = readDirent(fd, buf) - if err != nil { - return os.NewSyscallError("readdirent", err) - } - if nbuf <= 0 { - return nil - } + n, err := readDirent(fd, bbuf) + if err != nil { + return err } - consumed, name, typ := dirent.Parse(buf[bufp:nbuf]) - bufp += consumed - - if name == "" || name == "." || name == ".." { - continue + if n <= 0 { + return nil } - // Fallback for filesystems (like old XFS) that don't - // support Dirent.Type and have DT_UNKNOWN (0) there - // instead. - if typ == unknownFileMode { - fi, err := os.Lstat(dirName + "/" + name) - if err != nil { - // It got deleted in the meantime. - if os.IsNotExist(err) { - continue + buf := bbuf[:n:n] + + for len(buf) > 0 { + reclen, ok := dirent.DirentReclen(buf) + if !ok || reclen > uint64(len(buf)) { + return nil + } + rec := buf[:reclen] + buf = buf[reclen:] + typ := dirent.DirentType(rec) + if skipFiles && typ.IsRegular() { + continue + } + const namoff = uint64(unsafe.Offsetof(syscall.Dirent{}.Name)) + namlen, ok := dirent.DirentNamlen(rec) + if !ok || namoff+namlen > uint64(len(rec)) { + break + } + name := rec[namoff : namoff+namlen] + for i, c := range name { + if c == 0 { + name = name[:i] + break } - return err } - typ = fi.Mode() & os.ModeType - } - if skipFiles && typ.IsRegular() { - continue - } - de := newUnixDirent(dirName, name, typ) - if err := fn(dirName, name, de); err != nil { - if err == ErrSkipFiles { - skipFiles = true + if string(name) == "." || string(name) == ".." { continue } - return err + nm := string(name) + if err := fn(dirName, nm, newUnixDirent(dirName, nm, typ)); err != nil { + if err != ErrSkipFiles { + return err + } + skipFiles = true + } } } } diff --git a/internal/dirent/dirent.go b/internal/dirent/dirent.go index 77a2989..0fd8ccd 100644 --- a/internal/dirent/dirent.go +++ b/internal/dirent/dirent.go @@ -1,14 +1,7 @@ -//go:build aix || dragonfly || freebsd || (js && wasm) || linux || netbsd || openbsd || solaris +//go:build aix || darwin || dragonfly || freebsd || (js && wasm) || linux || netbsd || openbsd || solaris package dirent -import ( - "os" - "runtime" - "syscall" - "unsafe" -) - // readInt returns the size-bytes unsigned integer in native byte order at offset off. func readInt(b []byte, off, size uintptr) (u uint64, ok bool) { if len(b) < int(off+size) { @@ -57,53 +50,3 @@ func readIntLE(b []byte, size uintptr) uint64 { panic("syscall: readInt with unsupported size") } } - -const InvalidMode = os.FileMode(1<<32 - 1) - -func Parse(buf []byte) (consumed int, name string, typ os.FileMode) { - - reclen, ok := direntReclen(buf) - if !ok || reclen > uint64(len(buf)) { - // WARN: this is a hard error because we consumed 0 bytes - // and not stopping here could lead to an infinite loop. - return 0, "", InvalidMode - } - consumed = int(reclen) - rec := buf[:reclen] - - ino, ok := direntIno(rec) - if !ok { - return consumed, "", InvalidMode - } - // When building to wasip1, the host runtime might be running on Windows - // or might expose a remote file system which does not have the concept - // of inodes. Therefore, we cannot make the assumption that it is safe - // to skip entries with zero inodes. - if ino == 0 && runtime.GOOS != "wasip1" { - return consumed, "", InvalidMode - } - - typ = direntType(buf) - - const namoff = uint64(unsafe.Offsetof(syscall.Dirent{}.Name)) - namlen, ok := direntNamlen(rec) - if !ok || namoff+namlen > uint64(len(rec)) { - return consumed, "", InvalidMode - } - namebuf := rec[namoff : namoff+namlen] - for i, c := range namebuf { - if c == 0 { - namebuf = namebuf[:i] - break - } - } - // Check for useless names before allocating a string. - if string(namebuf) == "." { - name = "." - } else if string(namebuf) == ".." { - name = ".." - } else { - name = string(namebuf) - } - return consumed, name, typ -} diff --git a/internal/dirent/dirent_aix.go b/internal/dirent/dirent_aix.go index 15d8435..d915528 100644 --- a/internal/dirent/dirent_aix.go +++ b/internal/dirent/dirent_aix.go @@ -8,22 +8,22 @@ import ( "unsafe" ) -func direntIno(buf []byte) (uint64, bool) { +func DirentIno(buf []byte) (uint64, bool) { return readInt(buf, unsafe.Offsetof(syscall.Dirent{}.Ino), unsafe.Sizeof(syscall.Dirent{}.Ino)) } -func direntReclen(buf []byte) (uint64, bool) { +func DirentReclen(buf []byte) (uint64, bool) { return readInt(buf, unsafe.Offsetof(syscall.Dirent{}.Reclen), unsafe.Sizeof(syscall.Dirent{}.Reclen)) } -func direntNamlen(buf []byte) (uint64, bool) { - reclen, ok := direntReclen(buf) +func DirentNamlen(buf []byte) (uint64, bool) { + reclen, ok := DirentReclen(buf) if !ok { return 0, false } return reclen - uint64(unsafe.Offsetof(syscall.Dirent{}.Name)), true } -func direntType(buf []byte) os.FileMode { +func DirentType(buf []byte) os.FileMode { return ^os.FileMode(0) // unknown } diff --git a/internal/dirent/dirent_darwin.go b/internal/dirent/dirent_darwin.go new file mode 100644 index 0000000..c502180 --- /dev/null +++ b/internal/dirent/dirent_darwin.go @@ -0,0 +1,46 @@ +//go:build darwin + +package dirent + +import ( + "os" + "syscall" + "unsafe" +) + +func DirentIno(buf []byte) (uint64, bool) { + return readInt(buf, unsafe.Offsetof(syscall.Dirent{}.Ino), unsafe.Sizeof(syscall.Dirent{}.Ino)) +} + +func DirentReclen(buf []byte) (uint64, bool) { + return readInt(buf, unsafe.Offsetof(syscall.Dirent{}.Reclen), unsafe.Sizeof(syscall.Dirent{}.Reclen)) +} + +func DirentNamlen(buf []byte) (uint64, bool) { + return readInt(buf, unsafe.Offsetof(syscall.Dirent{}.Namlen), unsafe.Sizeof(syscall.Dirent{}.Namlen)) +} + +func DirentType(buf []byte) os.FileMode { + off := unsafe.Offsetof(syscall.Dirent{}.Type) + if off >= uintptr(len(buf)) { + return ^os.FileMode(0) // unknown + } + typ := buf[off] + switch typ { + case syscall.DT_BLK: + return os.ModeDevice + case syscall.DT_CHR: + return os.ModeDevice | os.ModeCharDevice + case syscall.DT_DIR: + return os.ModeDir + case syscall.DT_FIFO: + return os.ModeNamedPipe + case syscall.DT_LNK: + return os.ModeSymlink + case syscall.DT_REG: + return 0 + case syscall.DT_SOCK: + return os.ModeSocket + } + return ^os.FileMode(0) +} diff --git a/internal/dirent/dirent_dragonfly.go b/internal/dirent/dirent_dragonfly.go index f9065f5..58f1133 100644 --- a/internal/dirent/dirent_dragonfly.go +++ b/internal/dirent/dirent_dragonfly.go @@ -8,23 +8,23 @@ import ( "unsafe" ) -func direntIno(buf []byte) (uint64, bool) { +func DirentIno(buf []byte) (uint64, bool) { return readInt(buf, unsafe.Offsetof(syscall.Dirent{}.Fileno), unsafe.Sizeof(syscall.Dirent{}.Fileno)) } -func direntReclen(buf []byte) (uint64, bool) { - namlen, ok := direntNamlen(buf) +func DirentReclen(buf []byte) (uint64, bool) { + namlen, ok := DirentNamlen(buf) if !ok { return 0, false } return (16 + namlen + 1 + 7) &^ 7, true } -func direntNamlen(buf []byte) (uint64, bool) { +func DirentNamlen(buf []byte) (uint64, bool) { return readInt(buf, unsafe.Offsetof(syscall.Dirent{}.Namlen), unsafe.Sizeof(syscall.Dirent{}.Namlen)) } -func direntType(buf []byte) os.FileMode { +func DirentType(buf []byte) os.FileMode { off := unsafe.Offsetof(syscall.Dirent{}.Type) if off >= uintptr(len(buf)) { return ^os.FileMode(0) // unknown diff --git a/internal/dirent/dirent_freebsd.go b/internal/dirent/dirent_freebsd.go index 2152518..4b07118 100644 --- a/internal/dirent/dirent_freebsd.go +++ b/internal/dirent/dirent_freebsd.go @@ -8,19 +8,19 @@ import ( "unsafe" ) -func direntIno(buf []byte) (uint64, bool) { +func DirentIno(buf []byte) (uint64, bool) { return readInt(buf, unsafe.Offsetof(syscall.Dirent{}.Fileno), unsafe.Sizeof(syscall.Dirent{}.Fileno)) } -func direntReclen(buf []byte) (uint64, bool) { +func DirentReclen(buf []byte) (uint64, bool) { return readInt(buf, unsafe.Offsetof(syscall.Dirent{}.Reclen), unsafe.Sizeof(syscall.Dirent{}.Reclen)) } -func direntNamlen(buf []byte) (uint64, bool) { +func DirentNamlen(buf []byte) (uint64, bool) { return readInt(buf, unsafe.Offsetof(syscall.Dirent{}.Namlen), unsafe.Sizeof(syscall.Dirent{}.Namlen)) } -func direntType(buf []byte) os.FileMode { +func DirentType(buf []byte) os.FileMode { off := unsafe.Offsetof(syscall.Dirent{}.Type) if off >= uintptr(len(buf)) { return ^os.FileMode(0) // unknown diff --git a/internal/dirent/dirent_js.go b/internal/dirent/dirent_js.go index 18bef93..8139a86 100644 --- a/internal/dirent/dirent_js.go +++ b/internal/dirent/dirent_js.go @@ -6,22 +6,22 @@ import ( "unsafe" ) -func direntIno(buf []byte) (uint64, bool) { +func DirentIno(buf []byte) (uint64, bool) { return 1, true } -func direntReclen(buf []byte) (uint64, bool) { +func DirentReclen(buf []byte) (uint64, bool) { return readInt(buf, unsafe.Offsetof(syscall.Dirent{}.Reclen), unsafe.Sizeof(syscall.Dirent{}.Reclen)) } -func direntNamlen(buf []byte) (uint64, bool) { - reclen, ok := direntReclen(buf) +func DirentNamlen(buf []byte) (uint64, bool) { + reclen, ok := DirentReclen(buf) if !ok { return 0, false } return reclen - uint64(unsafe.Offsetof(syscall.Dirent{}.Name)), true } -func direntType(buf []byte) os.FileMode { +func DirentType(buf []byte) os.FileMode { return ^os.FileMode(0) // unknown } diff --git a/internal/dirent/dirent_linux.go b/internal/dirent/dirent_linux.go index c2f6865..e96d78f 100644 --- a/internal/dirent/dirent_linux.go +++ b/internal/dirent/dirent_linux.go @@ -8,23 +8,23 @@ import ( "unsafe" ) -func direntIno(buf []byte) (uint64, bool) { +func DirentIno(buf []byte) (uint64, bool) { return readInt(buf, unsafe.Offsetof(syscall.Dirent{}.Ino), unsafe.Sizeof(syscall.Dirent{}.Ino)) } -func direntReclen(buf []byte) (uint64, bool) { +func DirentReclen(buf []byte) (uint64, bool) { return readInt(buf, unsafe.Offsetof(syscall.Dirent{}.Reclen), unsafe.Sizeof(syscall.Dirent{}.Reclen)) } -func direntNamlen(buf []byte) (uint64, bool) { - reclen, ok := direntReclen(buf) +func DirentNamlen(buf []byte) (uint64, bool) { + reclen, ok := DirentReclen(buf) if !ok { return 0, false } return reclen - uint64(unsafe.Offsetof(syscall.Dirent{}.Name)), true } -func direntType(buf []byte) os.FileMode { +func DirentType(buf []byte) os.FileMode { off := unsafe.Offsetof(syscall.Dirent{}.Type) if off >= uintptr(len(buf)) { return ^os.FileMode(0) // unknown diff --git a/internal/dirent/dirent_netbsd.go b/internal/dirent/dirent_netbsd.go index 99dbe7b..56f4da6 100644 --- a/internal/dirent/dirent_netbsd.go +++ b/internal/dirent/dirent_netbsd.go @@ -8,19 +8,19 @@ import ( "unsafe" ) -func direntIno(buf []byte) (uint64, bool) { +func DirentIno(buf []byte) (uint64, bool) { return readInt(buf, unsafe.Offsetof(syscall.Dirent{}.Fileno), unsafe.Sizeof(syscall.Dirent{}.Fileno)) } -func direntReclen(buf []byte) (uint64, bool) { +func DirentReclen(buf []byte) (uint64, bool) { return readInt(buf, unsafe.Offsetof(syscall.Dirent{}.Reclen), unsafe.Sizeof(syscall.Dirent{}.Reclen)) } -func direntNamlen(buf []byte) (uint64, bool) { +func DirentNamlen(buf []byte) (uint64, bool) { return readInt(buf, unsafe.Offsetof(syscall.Dirent{}.Namlen), unsafe.Sizeof(syscall.Dirent{}.Namlen)) } -func direntType(buf []byte) os.FileMode { +func DirentType(buf []byte) os.FileMode { off := unsafe.Offsetof(syscall.Dirent{}.Type) if off >= uintptr(len(buf)) { return ^os.FileMode(0) // unknown diff --git a/internal/dirent/dirent_openbsd.go b/internal/dirent/dirent_openbsd.go index ffd48b5..979ea67 100644 --- a/internal/dirent/dirent_openbsd.go +++ b/internal/dirent/dirent_openbsd.go @@ -8,19 +8,19 @@ import ( "unsafe" ) -func direntIno(buf []byte) (uint64, bool) { +func DirentIno(buf []byte) (uint64, bool) { return readInt(buf, unsafe.Offsetof(syscall.Dirent{}.Fileno), unsafe.Sizeof(syscall.Dirent{}.Fileno)) } -func direntReclen(buf []byte) (uint64, bool) { +func DirentReclen(buf []byte) (uint64, bool) { return readInt(buf, unsafe.Offsetof(syscall.Dirent{}.Reclen), unsafe.Sizeof(syscall.Dirent{}.Reclen)) } -func direntNamlen(buf []byte) (uint64, bool) { +func DirentNamlen(buf []byte) (uint64, bool) { return readInt(buf, unsafe.Offsetof(syscall.Dirent{}.Namlen), unsafe.Sizeof(syscall.Dirent{}.Namlen)) } -func direntType(buf []byte) os.FileMode { +func DirentType(buf []byte) os.FileMode { off := unsafe.Offsetof(syscall.Dirent{}.Type) if off >= uintptr(len(buf)) { return ^os.FileMode(0) // unknown diff --git a/internal/dirent/dirent_solaris.go b/internal/dirent/dirent_solaris.go index 8ef23c1..7a8c3f9 100644 --- a/internal/dirent/dirent_solaris.go +++ b/internal/dirent/dirent_solaris.go @@ -8,22 +8,22 @@ import ( "unsafe" ) -func direntIno(buf []byte) (uint64, bool) { +func DirentIno(buf []byte) (uint64, bool) { return readInt(buf, unsafe.Offsetof(syscall.Dirent{}.Ino), unsafe.Sizeof(syscall.Dirent{}.Ino)) } -func direntReclen(buf []byte) (uint64, bool) { +func DirentReclen(buf []byte) (uint64, bool) { return readInt(buf, unsafe.Offsetof(syscall.Dirent{}.Reclen), unsafe.Sizeof(syscall.Dirent{}.Reclen)) } -func direntNamlen(buf []byte) (uint64, bool) { - reclen, ok := direntReclen(buf) +func DirentNamlen(buf []byte) (uint64, bool) { + reclen, ok := DirentReclen(buf) if !ok { return 0, false } return reclen - uint64(unsafe.Offsetof(syscall.Dirent{}.Name)), true } -func direntType(buf []byte) os.FileMode { +func DirentType(buf []byte) os.FileMode { return ^os.FileMode(0) // unknown }