Added filterDirectoriesDuringWorkspaceWalking option
alexott committed Oct 31, 2024
1 parent 1b42338 commit ea3d1f4
Showing 6 changed files with 160 additions and 40 deletions.
1 change: 1 addition & 0 deletions docs/guides/experimental-exporter.md
@@ -63,6 +63,7 @@ All arguments are optional, and they tune what code is being generated.
* `-match` - Match resource names during listing operation. This filter applies to all resources that are getting listed, so if you want to import all dependencies of just one cluster, specify `-match=autoscaling -listing=compute`. By default, it is empty, which matches everything.
* `-matchRegex` - Match resource names against a given regex during listing operation. Applicable to all resources selected for listing.
* `-excludeRegex` - Exclude resource names matching a given regex. Applied during the listing operation and has higher priority than `-match` and `-matchRegex`. Applicable to all resources selected for listing. Could be used to exclude things like `databricks_automl` notebooks, etc.
* `-filterDirectoriesDuringWorkspaceWalking` - apply the match logic to directory names while walking the workspace tree. *Note: use this option with care, as the filter is applied to every entry. For example, to filter only specific users you must also match the `/Users` directory itself, so the regex becomes `^(/Users|/Users/[a-c].*)$` (see the sketch after this diff).*
* `-mounts` - List DBFS mount points; an extremely slow operation that is not performed unless explicitly requested.
* `-generateProviderDeclaration` - the flag that toggles the generation of the `databricks.tf` file with the declaration of the Databricks Terraform provider, which is required for Terraform versions since 0.13 (enabled by default).
* `-prefix` - optional prefix that will be added to the name of all exported resources - that's useful for exporting resources from multiple workspaces for merging into a single one.
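A minimal standalone sketch (not part of this commit; the user paths are hypothetical) showing why the `/Users` alternative is needed: when directory filtering is enabled, every directory on the way down must match, so the regex has to accept both the `/Users` root and the user folders you want.

```go
package main

import (
	"fmt"
	"regexp"
)

func main() {
	// Regex from the docs entry above: matches the /Users root itself
	// plus any user directory whose name starts with a, b, or c.
	re := regexp.MustCompile(`^(/Users|/Users/[a-c].*)$`)

	// Hypothetical paths encountered while walking the workspace tree.
	paths := []string{
		"/Users",                   // must match, or walking stops at the root
		"/Users/alice@example.com", // matches: starts with "a"
		"/Users/dave@example.com",  // filtered out: starts with "d"
	}
	for _, p := range paths {
		fmt.Printf("%-26s include=%v\n", p, re.MatchString(p))
	}
}
```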
2 changes: 2 additions & 0 deletions exporter/command.go
@@ -131,6 +131,8 @@ func Run(args ...string) error {
flags.BoolVar(&ic.mounts, "mounts", false, "List DBFS mount points.")
flags.BoolVar(&ic.generateDeclaration, "generateProviderDeclaration", true,
"Generate Databricks provider declaration.")
flags.BoolVar(&ic.filterDirectoriesDuringWorkspaceWalking, "filterDirectoriesDuringWorkspaceWalking", false,
"Apply filtering to directory names during workspace walking")
flags.StringVar(&ic.notebooksFormat, "notebooksFormat", "SOURCE",
"Format to export notebooks: SOURCE, DBC, JUPYTER. Default: SOURCE")
services, listing := ic.allServicesAndListing()
53 changes: 27 additions & 26 deletions exporter/context.go
@@ -78,32 +78,33 @@ type importContext struct {
Scope importedResources

// command-line resources (immutable, or set by the single thread)
includeUserDomains bool
importAllUsers bool
exportDeletedUsersAssets bool
incremental bool
mounts bool
noFormat bool
nativeImportSupported bool
services map[string]struct{}
listing map[string]struct{}
match string
matchRegexStr string
matchRegex *regexp.Regexp
excludeRegexStr string
excludeRegex *regexp.Regexp
lastActiveDays int64
lastActiveMs int64
generateDeclaration bool
exportSecrets bool
meAdmin bool
meUserName string
prefix string
accountLevel bool
shImports map[string]bool
notebooksFormat string
updatedSinceStr string
updatedSinceMs int64
includeUserDomains bool
importAllUsers bool
exportDeletedUsersAssets bool
incremental bool
mounts bool
noFormat bool
nativeImportSupported bool
services map[string]struct{}
listing map[string]struct{}
match string
matchRegexStr string
matchRegex *regexp.Regexp
excludeRegexStr string
excludeRegex *regexp.Regexp
filterDirectoriesDuringWorkspaceWalking bool
lastActiveDays int64
lastActiveMs int64
generateDeclaration bool
exportSecrets bool
meAdmin bool
meUserName string
prefix string
accountLevel bool
shImports map[string]bool
notebooksFormat string
updatedSinceStr string
updatedSinceMs int64

waitGroup *sync.WaitGroup

115 changes: 111 additions & 4 deletions exporter/exporter_test.go
@@ -2349,7 +2349,7 @@ func TestImportingGlobalSqlConfig(t *testing.T) {
})
}

func TestImportingNotebooksWorkspaceFiles(t *testing.T) {
func TestImportingNotebooksWorkspaceFilesWithFilter(t *testing.T) {
fileStatus := workspace.ObjectStatus{
ObjectID: 123,
ObjectType: workspace.File,
@@ -2372,22 +2372,27 @@ func TestImportingNotebooksWorkspaceFiles(t *testing.T) {
Resource: "/api/2.0/workspace/list?path=%2F",
Response: workspace.ObjectList{
Objects: []workspace.ObjectStatus{notebookStatus, fileStatus,
workspace.ObjectStatus{
{
ObjectID: 4567,
ObjectType: workspace.Notebook,
Path: "/UnmatchedNotebook",
Language: "PYTHON",
},
workspace.ObjectStatus{
{
ObjectID: 1234,
ObjectType: workspace.File,
Path: "/UnmatchedFile",
},
workspace.ObjectStatus{
{
ObjectID: 456,
ObjectType: workspace.Directory,
Path: "/databricks_automl",
},
{
ObjectID: 456,
ObjectType: workspace.Directory,
Path: "/.bundle",
},
},
},
ReuseRequest: true,
@@ -2455,6 +2460,108 @@ func TestImportingNotebooksWorkspaceFiles(t *testing.T) {
})
}

func TestImportingNotebooksWorkspaceFilesWithFilterDuringWalking(t *testing.T) {
fileStatus := workspace.ObjectStatus{
ObjectID: 123,
ObjectType: workspace.File,
Path: "/File",
}
notebookStatus := workspace.ObjectStatus{
ObjectID: 456,
ObjectType: workspace.Notebook,
Path: "/Notebook",
Language: "PYTHON",
}
qa.HTTPFixturesApply(t,
[]qa.HTTPFixture{
meAdminFixture,
noCurrentMetastoreAttached,
emptyRepos,
emptyIpAccessLIst,
{
Method: "GET",
Resource: "/api/2.0/workspace/list?path=%2F",
Response: workspace.ObjectList{
Objects: []workspace.ObjectStatus{notebookStatus, fileStatus,
{
ObjectID: 4567,
ObjectType: workspace.Notebook,
Path: "/UnmatchedNotebook",
Language: "PYTHON",
},
{
ObjectID: 1234,
ObjectType: workspace.File,
Path: "/UnmatchedFile",
},
{
ObjectID: 456,
ObjectType: workspace.Directory,
Path: "/databricks_automl",
},
},
},
ReuseRequest: true,
},
{
Method: "GET",
Resource: "/api/2.0/workspace/get-status?path=%2FNotebook",
Response: notebookStatus,
ReuseRequest: true,
},
{
Method: "GET",
Resource: "/api/2.0/workspace/get-status?path=%2FFile",
Response: fileStatus,
ReuseRequest: true,
},
{
Method: "GET",
Resource: "/api/2.0/workspace/export?format=AUTO&path=%2FFile",
Response: workspace.ExportPath{
Content: "dGVzdA==",
},
ReuseRequest: true,
},
{
Method: "GET",
Resource: "/api/2.0/workspace/export?format=SOURCE&path=%2FNotebook",
Response: workspace.ExportPath{
Content: "dGVzdA==",
},
ReuseRequest: true,
},
},
func(ctx context.Context, client *common.DatabricksClient) {
tmpDir := fmt.Sprintf("/tmp/tf-%s", qa.RandomName())
defer os.RemoveAll(tmpDir)

ic := newImportContext(client)
ic.Directory = tmpDir
ic.enableListing("notebooks,wsfiles")
ic.excludeRegexStr = "databricks_automl"
ic.matchRegexStr = "^/[FN].*$"
ic.filterDirectoriesDuringWorkspaceWalking = true

err := ic.Run()
assert.NoError(t, err)
// check generated code for notebooks
content, err := os.ReadFile(tmpDir + "/notebooks.tf")
assert.NoError(t, err)
contentStr := string(content)
assert.True(t, strings.Contains(contentStr, `resource "databricks_notebook" "notebook_456"`))
assert.True(t, strings.Contains(contentStr, `path = "/Notebook"`))
assert.False(t, strings.Contains(contentStr, `/UnmatchedNotebook`))
// check generated code for workspace files
content, err = os.ReadFile(tmpDir + "/wsfiles.tf")
assert.NoError(t, err)
contentStr = string(content)
assert.True(t, strings.Contains(contentStr, `resource "databricks_workspace_file" "file_123"`))
assert.True(t, strings.Contains(contentStr, `path = "/File"`))
assert.False(t, strings.Contains(contentStr, `/UnmatchedFile`))
})
}

func TestImportingModelServing(t *testing.T) {
qa.HTTPFixturesApply(t,
[]qa.HTTPFixture{
12 changes: 6 additions & 6 deletions exporter/util_test.go
@@ -316,16 +316,16 @@ func TestGetEnvAsInt(t *testing.T) {
}

func TestExcludeAuxiliaryDirectories(t *testing.T) {
assert.True(t, excludeAuxiliaryDirectories(workspace.ObjectStatus{Path: "", ObjectType: workspace.Directory}))
assert.True(t, excludeAuxiliaryDirectories(workspace.ObjectStatus{ObjectType: workspace.File}))
assert.True(t, excludeAuxiliaryDirectories(workspace.ObjectStatus{Path: "/Users/[email protected]/abc",
assert.False(t, isAuxiliaryDirectory(workspace.ObjectStatus{Path: "", ObjectType: workspace.Directory}))
assert.False(t, isAuxiliaryDirectory(workspace.ObjectStatus{ObjectType: workspace.File}))
assert.False(t, isAuxiliaryDirectory(workspace.ObjectStatus{Path: "/Users/[email protected]/abc",
ObjectType: workspace.Directory}))
// should be ignored
assert.False(t, excludeAuxiliaryDirectories(workspace.ObjectStatus{Path: "/Users/[email protected]/.ide",
assert.True(t, isAuxiliaryDirectory(workspace.ObjectStatus{Path: "/Users/[email protected]/.ide",
ObjectType: workspace.Directory}))
assert.False(t, excludeAuxiliaryDirectories(workspace.ObjectStatus{Path: "/Shared/.bundle",
assert.True(t, isAuxiliaryDirectory(workspace.ObjectStatus{Path: "/Shared/.bundle",
ObjectType: workspace.Directory}))
assert.False(t, excludeAuxiliaryDirectories(workspace.ObjectStatus{Path: "/Users/[email protected]/abc/__pycache__",
assert.True(t, isAuxiliaryDirectory(workspace.ObjectStatus{Path: "/Users/[email protected]/abc/__pycache__",
ObjectType: workspace.Directory}))
}

17 changes: 13 additions & 4 deletions exporter/util_workspace.go
@@ -93,17 +93,18 @@ func (ic *importContext) getAllDirectories() []workspace.ObjectStatus {
var directoriesToIgnore = []string{".ide", ".bundle", "__pycache__"}

// TODO: add ignoring directories of deleted users? This could potentially decrease the number of processed objects...
func excludeAuxiliaryDirectories(v workspace.ObjectStatus) bool {
func isAuxiliaryDirectory(v workspace.ObjectStatus) bool {
if v.ObjectType != workspace.Directory {
return true
return false
}
// TODO: rewrite to use suffix check, etc., instead of split and slice contains?
parts := strings.Split(v.Path, "/")
result := len(parts) > 1 && slices.Contains[[]string, string](directoriesToIgnore, parts[len(parts)-1])
log.Printf("[DEBUG] directory %s: %v", v.Path, result)
if result {
log.Printf("[DEBUG] Ignoring directory %s", v.Path)
}
return !result
return result
}

func (ic *importContext) getAllWorkspaceObjects(visitor func([]workspace.ObjectStatus)) []workspace.ObjectStatus {
@@ -113,7 +114,15 @@ func (ic *importContext) getAllWorkspaceObjects(visitor func([]workspace.ObjectS
t1 := time.Now()
log.Print("[INFO] Starting to list all workspace objects")
notebooksAPI := workspace.NewNotebooksAPI(ic.Context, ic.Client)
ic.allWorkspaceObjects, _ = ListParallel(notebooksAPI, "/", excludeAuxiliaryDirectories, visitor)
shouldIncludeDirectory := func(v workspace.ObjectStatus) bool {
decision := !isAuxiliaryDirectory(v)
if decision && ic.filterDirectoriesDuringWorkspaceWalking {
decision = ic.MatchesName(v.Path)
}
// log.Printf("[DEBUG] decision of shouldIncludeDirectory for %s: %v", v.Path, decision)
return decision
}
ic.allWorkspaceObjects, _ = ListParallel(notebooksAPI, "/", shouldIncludeDirectory, visitor)
log.Printf("[INFO] Finished listing of all workspace objects. %d objects in total. %v seconds",
len(ic.allWorkspaceObjects), time.Since(t1).Seconds())
}
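The TODO in `isAuxiliaryDirectory` suggests replacing the split-and-contains check with a suffix check; a minimal sketch of that alternative (an assumption, not part of this commit — it reuses the file's `directoriesToIgnore` list and `workspace` types, and needs `strings` instead of `slices`):

```go
// Suffix-based variant of isAuxiliaryDirectory, as hinted by the TODO above.
// Prepending "/" to each ignored name preserves the behavior of the original
// check, which only matched full path components below the root.
func isAuxiliaryDirectorySuffix(v workspace.ObjectStatus) bool {
	if v.ObjectType != workspace.Directory {
		return false
	}
	for _, d := range directoriesToIgnore {
		if strings.HasSuffix(v.Path, "/"+d) {
			return true
		}
	}
	return false
}
```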
