[Exporter] Allow to match resource names by regular expression #4177

Merged · 4 commits · Nov 1, 2024
3 changes: 3 additions & 0 deletions docs/guides/experimental-exporter.md
@@ -61,6 +61,9 @@ All arguments are optional, and they tune what code is being generated.
* `-listing` - Comma-separated list of services to be listed and further passed on for importing. For each service specified, the exporter performs a listing of available resources using the `List` function and emits them for importing together with their dependencies. The `-services` parameter can be used to control which transitive dependencies will also be imported.
* `-services` - Comma-separated list of services to import. By default, all services are imported.
* `-match` - Match resource names during the listing operation. This filter applies to all resources that are getting listed, so if you want to import all dependencies of just one cluster, specify `-match=autoscaling -listing=compute`. By default, it is empty, which matches everything.
* `-matchRegex` - Match resource names against a given regex during the listing operation. Applies to all resources selected for listing.
* `-excludeRegex` - Exclude resource names matching a given regex. Applied during the listing operation, and has higher priority than `-match` and `-matchRegex`. Applies to all resources selected for listing. Can be used to exclude objects such as `databricks_automl` notebooks (see the sketch after this list).
* `-filterDirectoriesDuringWorkspaceWalking` - whether the match logic should also be applied to directory names while walking the workspace tree. *Note: use this with care, as the filter is applied to all entries; if you want to filter only specific users, you also need a condition for `/Users` itself, so the regex becomes `^(/Users|/Users/[a-c].*)$`.*
* `-mounts` - List DBFS mount points. This is an extremely slow operation that runs only when explicitly requested.
* `-generateProviderDeclaration` - the flag that toggles generation of the `databricks.tf` file with the declaration of the Databricks Terraform provider, required for Terraform 0.13 and later (enabled by default).
* `-prefix` - optional prefix that will be added to the names of all exported resources; useful when exporting resources from multiple workspaces for later merging into a single one.
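
To illustrate how these filters combine, here is a minimal, self-contained Go sketch of the documented precedence (exclusion first, then the regex match, then the plain substring match); `matchesName` and the sample paths are illustrative stand-ins, not the exporter's actual API:

```go
package main

import (
	"fmt"
	"regexp"
	"strings"
)

// matchesName mirrors the documented precedence: -excludeRegex is checked
// first, then -matchRegex, then the plain substring -match.
func matchesName(name, match string, matchRe, excludeRe *regexp.Regexp) bool {
	if match == "" && matchRe == nil && excludeRe == nil {
		return true // no filters configured: everything matches
	}
	if excludeRe != nil && excludeRe.MatchString(name) {
		return false // exclusion wins over any positive filter
	}
	if matchRe != nil {
		return matchRe.MatchString(name)
	}
	return strings.Contains(strings.ToLower(name), strings.ToLower(match))
}

func main() {
	matchRe := regexp.MustCompile(`^/[FN].*$`)           // as in -matchRegex='^/[FN].*$'
	excludeRe := regexp.MustCompile(`databricks_automl`) // as in -excludeRegex=databricks_automl
	for _, p := range []string{"/Notebook", "/File", "/UnmatchedNotebook", "/Notebooks/databricks_automl"} {
		fmt.Printf("%-30s -> %v\n", p, matchesName(p, "", matchRe, excludeRe))
	}
	// /Notebook and /File match; /UnmatchedNotebook fails the regex;
	// /Notebooks/databricks_automl matches the regex but is excluded.
}
```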
8 changes: 8 additions & 0 deletions exporter/command.go
@@ -131,6 +131,8 @@ func Run(args ...string) error {
flags.BoolVar(&ic.mounts, "mounts", false, "List DBFS mount points.")
flags.BoolVar(&ic.generateDeclaration, "generateProviderDeclaration", true,
"Generate Databricks provider declaration.")
flags.BoolVar(&ic.filterDirectoriesDuringWorkspaceWalking, "filterDirectoriesDuringWorkspaceWalking", false,
"Apply filtering to directory names during workspace walking")
flags.StringVar(&ic.notebooksFormat, "notebooksFormat", "SOURCE",
"Format to export notebooks: SOURCE, DBC, JUPYTER. Default: SOURCE")
services, listing := ic.allServicesAndListing()
@@ -145,6 +147,12 @@ func Run(args ...string) error {
flags.StringVar(&ic.match, "match", "", "Match resource names during listing operation. "+
"This filter applies to all resources that are getting listed, so if you want to import "+
"all dependencies of just one cluster, specify -listing=compute")
flags.StringVar(&ic.matchRegexStr, "matchRegex", "", "Match resource names during listing operation against a regex. "+
"This filter applies to all resources that are getting listed, so if you want to import "+
"all dependencies of just one cluster, specify -listing=compute")
flags.StringVar(&ic.excludeRegexStr, "excludeRegex", "", "Exclude resource names matching regex during listing operation. "+
"This filter applies to all resources that are getting listed, so if you want to import "+
"all dependencies of just one cluster, specify -listing=compute")
prefix := ""
flags.StringVar(&prefix, "prefix", "", "Prefix that will be added to the name of all exported resources")
newArgs := args
67 changes: 45 additions & 22 deletions exporter/context.go
@@ -78,28 +78,33 @@ type importContext struct {
Scope importedResources

// command-line resources (immutable, or set by the single thread)
includeUserDomains bool
importAllUsers bool
exportDeletedUsersAssets bool
incremental bool
mounts bool
noFormat bool
nativeImportSupported bool
services map[string]struct{}
listing map[string]struct{}
match string
lastActiveDays int64
lastActiveMs int64
generateDeclaration bool
exportSecrets bool
meAdmin bool
meUserName string
prefix string
accountLevel bool
shImports map[string]bool
notebooksFormat string
updatedSinceStr string
updatedSinceMs int64
includeUserDomains bool
importAllUsers bool
exportDeletedUsersAssets bool
incremental bool
mounts bool
noFormat bool
nativeImportSupported bool
services map[string]struct{}
listing map[string]struct{}
match string
matchRegexStr string
matchRegex *regexp.Regexp
excludeRegexStr string
excludeRegex *regexp.Regexp
filterDirectoriesDuringWorkspaceWalking bool
lastActiveDays int64
lastActiveMs int64
generateDeclaration bool
exportSecrets bool
meAdmin bool
meUserName string
prefix string
accountLevel bool
shImports map[string]bool
notebooksFormat string
updatedSinceStr string
updatedSinceMs int64

waitGroup *sync.WaitGroup

@@ -297,6 +302,24 @@ func (ic *importContext) Run() error {
return fmt.Errorf("no services to import")
}

if ic.matchRegexStr != "" {
log.Printf("[DEBUG] Using regex '%s' to filter resources", ic.matchRegexStr)
re, err := regexp.Compile(ic.matchRegexStr)
if err != nil {
log.Printf("[ERROR] can't compile regex '%s': %v", ic.matchRegexStr, err)
return err
}
ic.matchRegex = re
}
if ic.excludeRegexStr != "" {
log.Printf("[DEBUG] Using regex '%s' to filter resources", ic.excludeRegexStr)
re, err := regexp.Compile(ic.excludeRegexStr)
if err != nil {
log.Printf("[ERROR] can't compile regex '%s': %v", ic.excludeRegexStr, err)
return err
}
ic.excludeRegex = re
}
if ic.incremental {
if ic.updatedSinceStr == "" {
ic.updatedSinceStr = getLastRunString(statsFileName)
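`Run` compiles both patterns once up front and fails fast on an invalid pattern. One detail worth keeping in mind when choosing patterns for `-matchRegex` or `-excludeRegex`: Go's `MatchString` reports a match if the pattern matches anywhere in the name, not only when it matches the whole name, so anchor with `^` and `$` for exact matching. A minimal sketch (the paths are illustrative):

```go
package main

import (
	"fmt"
	"regexp"
)

func main() {
	// Unanchored: MatchString succeeds if the pattern occurs anywhere
	// inside the name, exactly like the exporter's use of matchRegex.
	loose := regexp.MustCompile(`Notebook`)
	fmt.Println(loose.MatchString("/UnmatchedNotebook")) // true

	// Anchored: only names consisting entirely of the pattern match.
	strict := regexp.MustCompile(`^/Notebook$`)
	fmt.Println(strict.MatchString("/UnmatchedNotebook")) // false
	fmt.Println(strict.MatchString("/Notebook"))          // true
}
```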
151 changes: 148 additions & 3 deletions exporter/exporter_test.go
@@ -2349,7 +2349,7 @@ func TestImportingGlobalSqlConfig(t *testing.T) {
})
}

func TestImportingNotebooksWorkspaceFiles(t *testing.T) {
func TestImportingNotebooksWorkspaceFilesWithFilter(t *testing.T) {
fileStatus := workspace.ObjectStatus{
ObjectID: 123,
ObjectType: workspace.File,
@@ -2371,7 +2371,135 @@ func TestImportingNotebooksWorkspaceFiles(t *testing.T) {
Method: "GET",
Resource: "/api/2.0/workspace/list?path=%2F",
Response: workspace.ObjectList{
Objects: []workspace.ObjectStatus{notebookStatus, fileStatus},
Objects: []workspace.ObjectStatus{notebookStatus, fileStatus,
{
ObjectID: 4567,
ObjectType: workspace.Notebook,
Path: "/UnmatchedNotebook",
Language: "PYTHON",
},
{
ObjectID: 1234,
ObjectType: workspace.File,
Path: "/UnmatchedFile",
},
{
ObjectID: 456,
ObjectType: workspace.Directory,
Path: "/databricks_automl",
},
{
ObjectID: 456,
ObjectType: workspace.Directory,
Path: "/.bundle",
},
},
},
ReuseRequest: true,
},
{
Method: "GET",
Resource: "/api/2.0/workspace/list?path=%2Fdatabricks_automl",
Response: workspace.ObjectList{},
},
{
Method: "GET",
Resource: "/api/2.0/workspace/get-status?path=%2FNotebook",
Response: notebookStatus,
ReuseRequest: true,
},
{
Method: "GET",
Resource: "/api/2.0/workspace/get-status?path=%2FFile",
Response: fileStatus,
ReuseRequest: true,
},
{
Method: "GET",
Resource: "/api/2.0/workspace/export?format=AUTO&path=%2FFile",
Response: workspace.ExportPath{
Content: "dGVzdA==",
},
ReuseRequest: true,
},
{
Method: "GET",
Resource: "/api/2.0/workspace/export?format=SOURCE&path=%2FNotebook",
Response: workspace.ExportPath{
Content: "dGVzdA==",
},
ReuseRequest: true,
},
},
func(ctx context.Context, client *common.DatabricksClient) {
tmpDir := fmt.Sprintf("/tmp/tf-%s", qa.RandomName())
defer os.RemoveAll(tmpDir)

ic := newImportContext(client)
ic.Directory = tmpDir
ic.enableListing("notebooks,wsfiles")
ic.excludeRegexStr = "databricks_automl"
ic.matchRegexStr = "^/[FN].*$"

err := ic.Run()
assert.NoError(t, err)
// check generated code for notebooks
content, err := os.ReadFile(tmpDir + "/notebooks.tf")
assert.NoError(t, err)
contentStr := string(content)
assert.True(t, strings.Contains(contentStr, `resource "databricks_notebook" "notebook_456"`))
assert.True(t, strings.Contains(contentStr, `path = "/Notebook"`))
assert.False(t, strings.Contains(contentStr, `/UnmatchedNotebook`))
// check generated code for workspace files
content, err = os.ReadFile(tmpDir + "/wsfiles.tf")
assert.NoError(t, err)
contentStr = string(content)
assert.True(t, strings.Contains(contentStr, `resource "databricks_workspace_file" "file_123"`))
assert.True(t, strings.Contains(contentStr, `path = "/File"`))
assert.False(t, strings.Contains(contentStr, `/UnmatchedFile`))
})
}

func TestImportingNotebooksWorkspaceFilesWithFilterDuringWalking(t *testing.T) {
fileStatus := workspace.ObjectStatus{
ObjectID: 123,
ObjectType: workspace.File,
Path: "/File",
}
notebookStatus := workspace.ObjectStatus{
ObjectID: 456,
ObjectType: workspace.Notebook,
Path: "/Notebook",
Language: "PYTHON",
}
qa.HTTPFixturesApply(t,
[]qa.HTTPFixture{
meAdminFixture,
noCurrentMetastoreAttached,
emptyRepos,
emptyIpAccessLIst,
{
Method: "GET",
Resource: "/api/2.0/workspace/list?path=%2F",
Response: workspace.ObjectList{
Objects: []workspace.ObjectStatus{notebookStatus, fileStatus,
{
ObjectID: 4567,
ObjectType: workspace.Notebook,
Path: "/UnmatchedNotebook",
Language: "PYTHON",
},
{
ObjectID: 1234,
ObjectType: workspace.File,
Path: "/UnmatchedFile",
},
{
ObjectID: 456,
ObjectType: workspace.Directory,
Path: "/databricks_automl",
},
},
},
ReuseRequest: true,
},
@@ -2410,10 +2538,27 @@ func TestImportingNotebooksWorkspaceFiles(t *testing.T) {

ic := newImportContext(client)
ic.Directory = tmpDir
ic.enableListing("notebooks")
ic.enableListing("notebooks,wsfiles")
ic.excludeRegexStr = "databricks_automl"
ic.matchRegexStr = "^/[FN].*$"
ic.filterDirectoriesDuringWorkspaceWalking = true

err := ic.Run()
assert.NoError(t, err)
// check generated code for notebooks
content, err := os.ReadFile(tmpDir + "/notebooks.tf")
assert.NoError(t, err)
contentStr := string(content)
assert.True(t, strings.Contains(contentStr, `resource "databricks_notebook" "notebook_456"`))
assert.True(t, strings.Contains(contentStr, `path = "/Notebook"`))
assert.False(t, strings.Contains(contentStr, `/UnmatchedNotebook`))
// check generated code for workspace files
content, err = os.ReadFile(tmpDir + "/wsfiles.tf")
assert.NoError(t, err)
contentStr = string(content)
assert.True(t, strings.Contains(contentStr, `resource "databricks_workspace_file" "file_123"`))
assert.True(t, strings.Contains(contentStr, `path = "/File"`))
assert.False(t, strings.Contains(contentStr, `/UnmatchedFile`))
})
}

8 changes: 7 additions & 1 deletion exporter/util.go
@@ -35,9 +35,15 @@ func (ic *importContext) isServiceInListing(service string) bool {
}

func (ic *importContext) MatchesName(n string) bool {
if ic.match == "" {
if ic.match == "" && ic.matchRegex == nil && ic.excludeRegex == nil {
return true
}
if ic.excludeRegex != nil && ic.excludeRegex.MatchString(n) {
return false
}
if ic.matchRegex != nil {
return ic.matchRegex.MatchString(n)
}
return strings.Contains(strings.ToLower(n), strings.ToLower(ic.match))
}

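Note the asymmetry between the two positive filters in `MatchesName` above: the plain `-match` comparison lowercases both sides and is therefore case-insensitive, while the regex filters are case-sensitive unless the pattern opts in with `(?i)`. A small illustrative sketch:

```go
package main

import (
	"fmt"
	"regexp"
	"strings"
)

func main() {
	// The plain -match filter lowercases both sides, so it is
	// case-insensitive:
	fmt.Println(strings.Contains(strings.ToLower("/Users/Admin"),
		strings.ToLower("ADMIN"))) // true

	// Regular expressions are case-sensitive by default; use (?i)
	// to get the same behavior from -matchRegex or -excludeRegex:
	fmt.Println(regexp.MustCompile(`admin`).MatchString("/Users/Admin"))     // false
	fmt.Println(regexp.MustCompile(`(?i)admin`).MatchString("/Users/Admin")) // true
}
```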
12 changes: 6 additions & 6 deletions exporter/util_test.go
@@ -316,16 +316,16 @@ func TestGetEnvAsInt(t *testing.T) {
}

func TestExcludeAuxiliaryDirectories(t *testing.T) {
assert.True(t, excludeAuxiliaryDirectories(workspace.ObjectStatus{Path: "", ObjectType: workspace.Directory}))
assert.True(t, excludeAuxiliaryDirectories(workspace.ObjectStatus{ObjectType: workspace.File}))
assert.True(t, excludeAuxiliaryDirectories(workspace.ObjectStatus{Path: "/Users/[email protected]/abc",
assert.False(t, isAuxiliaryDirectory(workspace.ObjectStatus{Path: "", ObjectType: workspace.Directory}))
assert.False(t, isAuxiliaryDirectory(workspace.ObjectStatus{ObjectType: workspace.File}))
assert.False(t, isAuxiliaryDirectory(workspace.ObjectStatus{Path: "/Users/[email protected]/abc",
ObjectType: workspace.Directory}))
// should be ignored
assert.False(t, excludeAuxiliaryDirectories(workspace.ObjectStatus{Path: "/Users/[email protected]/.ide",
assert.True(t, isAuxiliaryDirectory(workspace.ObjectStatus{Path: "/Users/[email protected]/.ide",
ObjectType: workspace.Directory}))
assert.False(t, excludeAuxiliaryDirectories(workspace.ObjectStatus{Path: "/Shared/.bundle",
assert.True(t, isAuxiliaryDirectory(workspace.ObjectStatus{Path: "/Shared/.bundle",
ObjectType: workspace.Directory}))
assert.False(t, excludeAuxiliaryDirectories(workspace.ObjectStatus{Path: "/Users/[email protected]/abc/__pycache__",
assert.True(t, isAuxiliaryDirectory(workspace.ObjectStatus{Path: "/Users/[email protected]/abc/__pycache__",
ObjectType: workspace.Directory}))
}

17 changes: 13 additions & 4 deletions exporter/util_workspace.go
@@ -93,17 +93,18 @@ func (ic *importContext) getAllDirectories() []workspace.ObjectStatus {
var directoriesToIgnore = []string{".ide", ".bundle", "__pycache__"}

// TODO: add ignoring directories of deleted users? This could potentially decrease the number of processed objects...
func excludeAuxiliaryDirectories(v workspace.ObjectStatus) bool {
func isAuxiliaryDirectory(v workspace.ObjectStatus) bool {
if v.ObjectType != workspace.Directory {
return true
return false
}
// TODO: rewrite to use suffix check, etc., instead of split and slice contains?
parts := strings.Split(v.Path, "/")
result := len(parts) > 1 && slices.Contains[[]string, string](directoriesToIgnore, parts[len(parts)-1])
log.Printf("[DEBUG] directory %s: %v", v.Path, result)
if result {
log.Printf("[DEBUG] Ignoring directory %s", v.Path)
}
return !result
return result
}

func (ic *importContext) getAllWorkspaceObjects(visitor func([]workspace.ObjectStatus)) []workspace.ObjectStatus {
@@ -113,7 +114,15 @@ func (ic *importContext) getAllWorkspaceObjects(visitor func([]workspace.ObjectStatus)) []workspace.ObjectStatus {
t1 := time.Now()
log.Print("[INFO] Starting to list all workspace objects")
notebooksAPI := workspace.NewNotebooksAPI(ic.Context, ic.Client)
ic.allWorkspaceObjects, _ = ListParallel(notebooksAPI, "/", excludeAuxiliaryDirectories, visitor)
shouldIncludeDirectory := func(v workspace.ObjectStatus) bool {
decision := !isAuxiliaryDirectory(v)
if decision && ic.filterDirectoriesDuringWorkspaceWalking {
decision = ic.MatchesName(v.Path)
}
// log.Printf("[DEBUG] decision of shouldIncludeDirectory for %s: %v", v.Path, decision)
return decision
}
ic.allWorkspaceObjects, _ = ListParallel(notebooksAPI, "/", shouldIncludeDirectory, visitor)
log.Printf("[INFO] Finished listing of all workspace objects. %d objects in total. %v seconds",
len(ic.allWorkspaceObjects), time.Since(t1).Seconds())
}
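To make the `/Users` caveat from the documentation concrete: when `-filterDirectoriesDuringWorkspaceWalking` is set, a directory that fails the match is never entered, so everything below it is pruned. A minimal sketch, with a hypothetical `shouldDescend` helper standing in for the closure above:

```go
package main

import (
	"fmt"
	"regexp"
)

// shouldDescend sketches the walk-time check: with directory filtering
// enabled, the match filter is applied to directory paths as well, and a
// rejected directory's subtree is never listed.
func shouldDescend(path string, matchRe *regexp.Regexp) bool {
	return matchRe.MatchString(path)
}

func main() {
	// The regex from the documentation's caveat: /Users itself must be
	// allowed, otherwise the walker never reaches /Users/a... at all.
	re := regexp.MustCompile(`^(/Users|/Users/[a-c].*)$`)
	for _, dir := range []string{"/Users", "/Users/alice", "/Users/zoe", "/Shared"} {
		fmt.Printf("%-14s descend=%v\n", dir, shouldDescend(dir, re))
	}
	// /Users and /Users/alice are walked; /Users/zoe and /Shared are
	// pruned together with everything below them.
}
```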