Skip to content

Commit

Permalink
Make fileset take optional list of paths to list (#1684)
Browse files Browse the repository at this point in the history
## Changes

Before this change, the fileset library would take a single root path
and list all files in it. To support an allowlist of paths to list (much
like a Git `pathspec` without patterns; see [pathspec](pathspec)), this
change introduces an optional argument to `fileset.New` where the caller
can specify paths to list. If not specified, this argument defaults to
list `.` (i.e. list all files in the root).

The motivation for this change is that we wish to expose this pattern in
bundles. Users should be able to specify which paths to synchronize
instead of always only synchronizing the bundle root directory.

[pathspec]:
https://git-scm.com/docs/gitglossary#Documentation/gitglossary.txt-aiddefpathspecapathspec

## Tests

New and existing unit tests.
  • Loading branch information
pietern authored Aug 19, 2024
1 parent ab4e809 commit 7de7583
Show file tree
Hide file tree
Showing 16 changed files with 256 additions and 58 deletions.
2 changes: 1 addition & 1 deletion bundle/config/validate/validate_sync_patterns.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ func checkPatterns(patterns []string, path string, rb bundle.ReadOnlyBundle) (di
return err
}

all, err := fs.All()
all, err := fs.Files()
if err != nil {
return err
}
Expand Down
4 changes: 2 additions & 2 deletions bundle/deploy/state_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ func TestFromSlice(t *testing.T) {
testutil.Touch(t, tmpDir, "test2.py")
testutil.Touch(t, tmpDir, "test3.py")

files, err := fileset.All()
files, err := fileset.Files()
require.NoError(t, err)

f, err := FromSlice(files)
Expand All @@ -38,7 +38,7 @@ func TestToSlice(t *testing.T) {
testutil.Touch(t, tmpDir, "test2.py")
testutil.Touch(t, tmpDir, "test3.py")

files, err := fileset.All()
files, err := fileset.Files()
require.NoError(t, err)

f, err := FromSlice(files)
Expand Down
2 changes: 1 addition & 1 deletion bundle/deploy/state_update_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ func setupBundleForStateUpdate(t *testing.T) *bundle.Bundle {
testutil.Touch(t, tmpDir, "test1.py")
testutil.TouchNotebook(t, tmpDir, "test2.py")

files, err := fileset.New(vfs.MustNew(tmpDir)).All()
files, err := fileset.New(vfs.MustNew(tmpDir)).Files()
require.NoError(t, err)

return &bundle.Bundle{
Expand Down
93 changes: 71 additions & 22 deletions libs/fileset/fileset.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,25 +3,56 @@ package fileset
import (
"fmt"
"io/fs"
"os"
pathlib "path"
"path/filepath"
"slices"

"github.com/databricks/cli/libs/vfs"
)

// FileSet facilitates fast recursive file listing of a path.
// FileSet facilitates recursive file listing for paths rooted at a given directory.
// It optionally takes into account ignore rules through the [Ignorer] interface.
type FileSet struct {
// Root path of the fileset.
root vfs.Path

// Paths to include in the fileset.
// Files are included as-is (if not ignored) and directories are traversed recursively.
// Defaults to []string{"."} if not specified.
paths []string

// Ignorer interface to check if a file or directory should be ignored.
ignore Ignorer
}

// New returns a [FileSet] for the given root path.
func New(root vfs.Path) *FileSet {
// It optionally accepts a list of paths relative to the root to include in the fileset.
// If not specified, it defaults to including all files in the root path.
func New(root vfs.Path, args ...[]string) *FileSet {
// Default to including all files in the root path.
if len(args) == 0 {
args = [][]string{{"."}}
}

// Collect list of normalized and cleaned paths.
var paths []string
for _, arg := range args {
for _, path := range arg {
path = filepath.ToSlash(path)
path = pathlib.Clean(path)

// Skip path if it's already in the list.
if slices.Contains(paths, path) {
continue
}

paths = append(paths, path)
}
}

return &FileSet{
root: root,
paths: paths,
ignore: nopIgnorer{},
}
}
Expand All @@ -36,49 +67,67 @@ func (w *FileSet) SetIgnorer(ignore Ignorer) {
w.ignore = ignore
}

// Return all tracked files for Repo
func (w *FileSet) All() ([]File, error) {
return w.recursiveListFiles()
// Files returns performs recursive listing on all configured paths and returns
// the collection of files it finds (and are not ignored).
// The returned slice does not contain duplicates.
// The order of files in the slice is stable.
func (w *FileSet) Files() (out []File, err error) {
seen := make(map[string]struct{})
for _, p := range w.paths {
files, err := w.recursiveListFiles(p, seen)
if err != nil {
return nil, err
}
out = append(out, files...)
}
return out, nil
}

// Recursively traverses dir in a depth first manner and returns a list of all files
// that are being tracked in the FileSet (ie not being ignored for matching one of the
// patterns in w.ignore)
func (w *FileSet) recursiveListFiles() (fileList []File, err error) {
err = fs.WalkDir(w.root, ".", func(name string, d fs.DirEntry, err error) error {
func (w *FileSet) recursiveListFiles(path string, seen map[string]struct{}) (out []File, err error) {
err = fs.WalkDir(w.root, path, func(name string, d fs.DirEntry, err error) error {
if err != nil {
return err
}

// skip symlinks
info, err := d.Info()
if err != nil {
return err
}
if info.Mode()&os.ModeSymlink != 0 {
return nil
}

if d.IsDir() {
switch {
case info.Mode().IsDir():
ign, err := w.ignore.IgnoreDirectory(name)
if err != nil {
return fmt.Errorf("cannot check if %s should be ignored: %w", name, err)
}
if ign {
return fs.SkipDir
}
return nil
}

ign, err := w.ignore.IgnoreFile(name)
if err != nil {
return fmt.Errorf("cannot check if %s should be ignored: %w", name, err)
}
if ign {
return nil
case info.Mode().IsRegular():
ign, err := w.ignore.IgnoreFile(name)
if err != nil {
return fmt.Errorf("cannot check if %s should be ignored: %w", name, err)
}
if ign {
return nil
}

// Skip duplicates
if _, ok := seen[name]; ok {
return nil
}

seen[name] = struct{}{}
out = append(out, NewFile(w.root, d, name))

default:
// Skip non-regular files (e.g. symlinks).
}

fileList = append(fileList, NewFile(w.root, d, name))
return nil
})
return
Expand Down
144 changes: 144 additions & 0 deletions libs/fileset/fileset_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
package fileset

import (
"errors"
"testing"

"github.com/databricks/cli/libs/vfs"
"github.com/stretchr/testify/assert"
)

func TestFileSet_NoPaths(t *testing.T) {
fs := New(vfs.MustNew("testdata"))
files, err := fs.Files()
if !assert.NoError(t, err) {
return
}

assert.Len(t, files, 4)
assert.Equal(t, "dir1/a", files[0].Relative)
assert.Equal(t, "dir1/b", files[1].Relative)
assert.Equal(t, "dir2/a", files[2].Relative)
assert.Equal(t, "dir2/b", files[3].Relative)
}

func TestFileSet_ParentPath(t *testing.T) {
fs := New(vfs.MustNew("testdata"), []string{".."})
_, err := fs.Files()

// It is impossible to escape the root directory.
assert.Error(t, err)
}

func TestFileSet_DuplicatePaths(t *testing.T) {
fs := New(vfs.MustNew("testdata"), []string{"dir1", "dir1"})
files, err := fs.Files()
if !assert.NoError(t, err) {
return
}

assert.Len(t, files, 2)
assert.Equal(t, "dir1/a", files[0].Relative)
assert.Equal(t, "dir1/b", files[1].Relative)
}

func TestFileSet_OverlappingPaths(t *testing.T) {
fs := New(vfs.MustNew("testdata"), []string{"dir1", "dir1/a"})
files, err := fs.Files()
if !assert.NoError(t, err) {
return
}

assert.Len(t, files, 2)
assert.Equal(t, "dir1/a", files[0].Relative)
assert.Equal(t, "dir1/b", files[1].Relative)
}

func TestFileSet_IgnoreDirError(t *testing.T) {
testError := errors.New("test error")
fs := New(vfs.MustNew("testdata"))
fs.SetIgnorer(testIgnorer{dirErr: testError})
_, err := fs.Files()
assert.ErrorIs(t, err, testError)
}

func TestFileSet_IgnoreDir(t *testing.T) {
fs := New(vfs.MustNew("testdata"))
fs.SetIgnorer(testIgnorer{dir: []string{"dir1"}})
files, err := fs.Files()
if !assert.NoError(t, err) {
return
}

assert.Len(t, files, 2)
assert.Equal(t, "dir2/a", files[0].Relative)
assert.Equal(t, "dir2/b", files[1].Relative)
}

func TestFileSet_IgnoreFileError(t *testing.T) {
testError := errors.New("test error")
fs := New(vfs.MustNew("testdata"))
fs.SetIgnorer(testIgnorer{fileErr: testError})
_, err := fs.Files()
assert.ErrorIs(t, err, testError)
}

func TestFileSet_IgnoreFile(t *testing.T) {
fs := New(vfs.MustNew("testdata"))
fs.SetIgnorer(testIgnorer{file: []string{"dir1/a"}})
files, err := fs.Files()
if !assert.NoError(t, err) {
return
}

assert.Len(t, files, 3)
assert.Equal(t, "dir1/b", files[0].Relative)
assert.Equal(t, "dir2/a", files[1].Relative)
assert.Equal(t, "dir2/b", files[2].Relative)
}

type testIgnorer struct {
// dir is a list of directories to ignore. Strings are compared verbatim.
dir []string

// dirErr is an error to return when IgnoreDirectory is called.
dirErr error

// file is a list of files to ignore. Strings are compared verbatim.
file []string

// fileErr is an error to return when IgnoreFile is called.
fileErr error
}

// IgnoreDirectory returns true if the path is in the dir list.
// If dirErr is set, it returns dirErr.
func (t testIgnorer) IgnoreDirectory(path string) (bool, error) {
if t.dirErr != nil {
return false, t.dirErr
}

for _, d := range t.dir {
if d == path {
return true, nil
}
}

return false, nil
}

// IgnoreFile returns true if the path is in the file list.
// If fileErr is set, it returns fileErr.
func (t testIgnorer) IgnoreFile(path string) (bool, error) {
if t.fileErr != nil {
return false, t.fileErr
}

for _, f := range t.file {
if f == path {
return true, nil
}
}

return false, nil
}
18 changes: 11 additions & 7 deletions libs/fileset/glob_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,15 +24,19 @@ func TestGlobFileset(t *testing.T) {
entries, err := root.ReadDir(".")
require.NoError(t, err)

// Remove testdata folder from entries
entries = slices.DeleteFunc(entries, func(de fs.DirEntry) bool {
return de.Name() == "testdata"
})

g, err := NewGlobSet(root, []string{
"./*.go",
})
require.NoError(t, err)

files, err := g.All()
files, err := g.Files()
require.NoError(t, err)

// +1 as there's one folder in ../filer
require.Equal(t, len(files), len(entries))
for _, f := range files {
exists := slices.ContainsFunc(entries, func(de fs.DirEntry) bool {
Expand All @@ -46,7 +50,7 @@ func TestGlobFileset(t *testing.T) {
})
require.NoError(t, err)

files, err = g.All()
files, err = g.Files()
require.NoError(t, err)
require.Equal(t, len(files), 0)
}
Expand All @@ -61,7 +65,7 @@ func TestGlobFilesetWithRelativeRoot(t *testing.T) {
})
require.NoError(t, err)

files, err := g.All()
files, err := g.Files()
require.NoError(t, err)
require.Equal(t, len(files), len(entries))
}
Expand All @@ -82,7 +86,7 @@ func TestGlobFilesetRecursively(t *testing.T) {
})
require.NoError(t, err)

files, err := g.All()
files, err := g.Files()
require.NoError(t, err)
require.ElementsMatch(t, entries, collectRelativePaths(files))
}
Expand All @@ -103,7 +107,7 @@ func TestGlobFilesetDir(t *testing.T) {
})
require.NoError(t, err)

files, err := g.All()
files, err := g.Files()
require.NoError(t, err)
require.ElementsMatch(t, entries, collectRelativePaths(files))
}
Expand All @@ -124,7 +128,7 @@ func TestGlobFilesetDoubleQuotesWithFilePatterns(t *testing.T) {
})
require.NoError(t, err)

files, err := g.All()
files, err := g.Files()
require.NoError(t, err)
require.ElementsMatch(t, entries, collectRelativePaths(files))
}
Empty file added libs/fileset/testdata/dir1/a
Empty file.
Empty file added libs/fileset/testdata/dir1/b
Empty file.
Empty file added libs/fileset/testdata/dir2/a
Empty file.
Empty file added libs/fileset/testdata/dir2/b
Empty file.
1 change: 1 addition & 0 deletions libs/fileset/testdata/dir3/a
Loading

0 comments on commit 7de7583

Please sign in to comment.