From 82ba90296d72e81c77e0cac78891b03967de65ef Mon Sep 17 00:00:00 2001 From: consolethinks Date: Tue, 23 Jul 2024 17:42:24 +0200 Subject: [PATCH 1/5] testForExistingSourceFolder refactor --- cmd/commands/datasetIngestor.go | 36 ++++++-- .../testForExistingSourceFolder.go | 91 ++++++------------- .../testForExistingSourceFolder_test.go | 31 ++++--- 3 files changed, 72 insertions(+), 86 deletions(-) diff --git a/cmd/commands/datasetIngestor.go b/cmd/commands/datasetIngestor.go index e6850be..599d91c 100644 --- a/cmd/commands/datasetIngestor.go +++ b/cmd/commands/datasetIngestor.go @@ -123,12 +123,6 @@ For Windows you need instead to specify -user username:password on the command l return } - // functions use this flag in a way where "nil -> unset" - var allowExistingSourceFolderPtr *bool = &allowExistingSourceFolder - if !noninteractiveFlag && !cmd.Flags().Lookup("allowexistingsource").Changed { - allowExistingSourceFolderPtr = nil - } - if showVersion { fmt.Printf("%s\n", VERSION) return @@ -196,10 +190,10 @@ For Windows you need instead to specify -user username:password on the command l if len(parts) > 3 && parts[3] == "data" { realSourceFolder, err := filepath.EvalSymlinks(line) if err != nil { - log.Fatalf("Failed to find canonical form of sourceFolder:%v %v", line, err) + log.Fatalf("Failed to find canonical form of sourceFolder:%v %v\n", line, err) } color.Set(color.FgYellow) - log.Printf("Transform sourceFolder %v to canonical form: %v", line, realSourceFolder) + log.Printf("Transform sourceFolder %v to canonical form: %v\n", line, realSourceFolder) color.Unset() datasetFolders = append(datasetFolders, realSourceFolder) } else { @@ -211,7 +205,28 @@ For Windows you need instead to specify -user username:password on the command l // log.Printf("Selected folders: %v\n", folders) // test if a sourceFolder already used in the past and give warning - datasetIngestor.TestForExistingSourceFolder(datasetFolders, client, APIServer, user["accessToken"], 
allowExistingSourceFolderPtr) + foundList, err := datasetIngestor.TestForExistingSourceFolder(datasetFolders, client, APIServer, user["accessToken"]) + if err != nil { + log.Fatal(err) + } + color.Set(color.FgYellow) + fmt.Println("Warning! The following datasets have been found with the same sourceFolder: ") + for _, element := range foundList { + fmt.Printf(" - PID: \"%s\", sourceFolder: \"%s\"\n", element.Pid, element.SourceFolder) + } + color.Unset() + if !allowExistingSourceFolder && len(foundList) > 0 { + if cmd.Flags().Changed("allowexistingsource") { + log.Printf("Do you want to ingest the corresponding new datasets nevertheless (y/N) ? ") + scanner.Scan() + archiveAgain := scanner.Text() + if archiveAgain != "y" { + log.Fatalln("Aborted.") + } + } else { + log.Fatalln("Existing sourceFolders are not allowed. Aborted.") + } + } // TODO ask archive system if sourcefolder is known to them. If yes no copy needed, otherwise // a destination location is defined by the archive system @@ -220,7 +235,7 @@ For Windows you need instead to specify -user username:password on the command l // now everything is prepared, start to loop over all folders var skip = "" // check if skip flag is globally defined via flags: - if cmd.Flags().Lookup("linkfiles").Changed { + if cmd.Flags().Changed("linkfiles") { switch linkfiles { case "delete": skip = "sA" @@ -402,4 +417,5 @@ func init() { datasetIngestorCmd.Flags().String("addcaption", "", "Optional caption to be stored with attachment (single dataset case only)") datasetIngestorCmd.MarkFlagsMutuallyExclusive("testenv", "devenv", "localenv", "tunnelenv") + //datasetIngestorCmd.MarkFlagsMutuallyExclusive("nocopy", "copy") } diff --git a/datasetIngestor/testForExistingSourceFolder.go b/datasetIngestor/testForExistingSourceFolder.go index d1d7aa5..1065e47 100644 --- a/datasetIngestor/testForExistingSourceFolder.go +++ b/datasetIngestor/testForExistingSourceFolder.go @@ -7,20 +7,18 @@ import ( "log" "net/http" "strings" - - 
"github.com/fatih/color" ) -/* Check if sourceFolders have already been used by existing datasets and give warning +/* + Check if sourceFolders have already been used by existing datasets and give warning The idea is to send ONE query which tests all datasets in one go, (but see chunking need below) -The filter condition can be defined within the header instead of the url +# The filter condition can be defined within the header instead of the url The filter size limit is server dependent: typically 8kB for the header, for URL length 2 kB (IE) Both limits may well be exceeed e.g. for 400 datasets Therefore split query into many chunks if too many folders are used in one job - */ type DatasetInfo struct { Pid string `json:"pid"` @@ -28,7 +26,7 @@ type DatasetInfo struct { Size int `json:"size"` } -type QueryResult []DatasetInfo +type DatasetQuery []DatasetInfo /* TestForExistingSourceFolder checks if the provided source folders already exist on the API server. @@ -42,38 +40,31 @@ Parameters: The function splits the folders into chunks of 100 and sends a GET request to the API server for each chunk. If a source folder already exists, the function logs a warning and asks the user if they want to continue. If the user chooses not to continue, the function stops the process and logs an error message. 
*/ -func TestForExistingSourceFolder(folders []string, client *http.Client, APIServer string, accessToken string, allowExistingSourceFolder *bool) { +func TestForExistingSourceFolder(folders []string, client *http.Client, APIServer string, accessToken string) (foundList DatasetQuery, err error) { // Split into chunks of 100 sourceFolders const chunkSize = 100 all := len(folders) chunks := (all-1)/chunkSize + 1 url := APIServer + "/Datasets?access_token=" + accessToken - - if allowExistingSourceFolder == nil || !(*allowExistingSourceFolder) { - for i := 0; i < chunks; i++ { - start := i * chunkSize - end := start + chunkSize - if end > all { - end = all - } - log.Printf("Checking sourceFolder %v to %v for existing entries...\n", start+1, end) - - sourceFolderList := strings.Join(folders[start:end], "\",\"") - filter := createFilter(sourceFolderList) - resp := makeRequest(client, url, filter) - respObj := processResponse(resp) - - if len(respObj) > 0 { - var item DatasetInfo - for _, item = range respObj { - log.Printf("Folder: %v, size: %v, PID: %v", item.SourceFolder, item.Size, item.Pid) - } - if !confirmIngestion(allowExistingSourceFolder) { - log.Fatalf("Use the flag -allowexistingsource to ingest nevertheless\n") - } - } + + for i := 0; i < chunks; i++ { + start := i * chunkSize + end := start + chunkSize + if end > all { + end = all + } + // TODO remove logs + log.Printf("Checking sourceFolder %v to %v for existing entries...\n", start+1, end) + + sourceFolderList := strings.Join(folders[start:end], "\",\"") + filter := createFilter(sourceFolderList) + resp, err := makeRequest(client, url, filter) + if err != nil { + return DatasetQuery{}, err } + foundList = append(foundList, processResponse(resp)...) 
} + return foundList, nil } func createFilter(sourceFolderList string) string { @@ -82,56 +73,34 @@ func createFilter(sourceFolderList string) string { return fmt.Sprintf("%s%s%s", header, sourceFolderList, tail) } -func makeRequest(client *http.Client, url string, filter string) *http.Response { +func makeRequest(client *http.Client, url string, filter string) (*http.Response, error) { req, err := http.NewRequest("GET", url, nil) if err != nil { - log.Fatal("Error creating request: ", err) + return nil, err } req.Header.Set("Content-Type", "application/json") req.Header.Set("filter", filter) - + resp, err := client.Do(req) if err != nil { log.Fatal(err) } defer resp.Body.Close() - return resp + return resp, nil } -func processResponse(resp *http.Response) QueryResult { +func processResponse(resp *http.Response) DatasetQuery { body, _ := io.ReadAll(resp.Body) - var respObj QueryResult + var respObj DatasetQuery if len(body) == 0 { + // TODO remove logs log.Printf("Warning: Response body is empty") return respObj } err := json.Unmarshal(body, &respObj) if err != nil { + // TODO remove logs log.Printf("Error: Failed to parse JSON response: %v", err) } return respObj } - -func confirmIngestion(allowExistingSourceFolder *bool) bool { - color.Set(color.FgYellow) - log.Printf("Warning: The following sourceFolders have already been used") - continueFlag := true - if allowExistingSourceFolder == nil { - log.Printf("Do you want to ingest the corresponding new datasets nevertheless (y/N) ? 
") - scanner.Scan() - archiveAgain := scanner.Text() - if archiveAgain != "y" { - continueFlag = false - } - } else { - continueFlag = *allowExistingSourceFolder - } - if continueFlag { - log.Printf("You chose to continue the new datasets nevertheless\n") - } else { - log.Printf("You chose not to continue\n") - log.Printf("Therefore the ingest process is stopped here, no datasets will be ingested\n") - } - color.Unset() - return continueFlag -} diff --git a/datasetIngestor/testForExistingSourceFolder_test.go b/datasetIngestor/testForExistingSourceFolder_test.go index 2dc2c27..7ea4b9e 100644 --- a/datasetIngestor/testForExistingSourceFolder_test.go +++ b/datasetIngestor/testForExistingSourceFolder_test.go @@ -1,12 +1,13 @@ package datasetIngestor import ( + "bytes" + "io" "net/http" "net/http/httptest" "testing" + "github.com/stretchr/testify/assert" - "io" - "bytes" ) func TestTestForExistingSourceFolder(t *testing.T) { @@ -20,18 +21,18 @@ func TestTestForExistingSourceFolder(t *testing.T) { })) // Close the server when test finishes defer server.Close() - + // Use Client & URL from our local test server client := server.Client() APIServer := server.URL accessToken := "testToken" - allowExistingSourceFolder := false - + folders := []string{"folder1", "folder2"} - - TestForExistingSourceFolder(folders, client, APIServer, accessToken, &allowExistingSourceFolder) + + // TODO test the results of this function + TestForExistingSourceFolder(folders, client, APIServer, accessToken) }) - + t.Run("test with existing folders and allowExistingSourceFolder true", func(t *testing.T) { // Create a mock server server := httptest.NewServer(http.HandlerFunc(func(rw http.ResponseWriter, req *http.Request) { @@ -42,16 +43,16 @@ func TestTestForExistingSourceFolder(t *testing.T) { })) // Close the server when test finishes defer server.Close() - + // Use Client & URL from our local test server client := server.Client() APIServer := server.URL accessToken := "testToken" - 
allowExistingSourceFolder := true - + folders := []string{"folder1", "folder2"} - - TestForExistingSourceFolder(folders, client, APIServer, accessToken, &allowExistingSourceFolder) + + // TODO test the results of this function. + TestForExistingSourceFolder(folders, client, APIServer, accessToken) }) } @@ -65,7 +66,7 @@ func TestProcessResponse(t *testing.T) { if len(result) != 1 || result[0].Pid != "123" || result[0].SourceFolder != "folder" || result[0].Size != 100 { t.Errorf("Unexpected result: %v", result) } - + // Test with invalid JSON invalidJSON := `{"pid": "123", "sourceFolder": "folder", "size": 100}` resp = &http.Response{ @@ -75,7 +76,7 @@ func TestProcessResponse(t *testing.T) { if len(result) != 0 { t.Errorf("Expected empty QueryResult, got '%v'", result) } - + // Test with empty body resp = &http.Response{ Body: io.NopCloser(bytes.NewBufferString("")), From d17f935b1c66a89b1e12a054af711b8e833f9e00 Mon Sep 17 00:00:00 2001 From: Attila Martin Nacsa Date: Mon, 29 Jul 2024 17:52:57 +0200 Subject: [PATCH 2/5] improve assembleFilelisting, datasetIngestor command --- cmd/commands/datasetIngestor.go | 86 ++++---- datasetIngestor/assembleFilelisting.go | 217 +++++++++++--------- datasetIngestor/assembleFilelisting_test.go | 15 +- 3 files changed, 171 insertions(+), 147 deletions(-) diff --git a/cmd/commands/datasetIngestor.go b/cmd/commands/datasetIngestor.go index 599d91c..5851826 100644 --- a/cmd/commands/datasetIngestor.go +++ b/cmd/commands/datasetIngestor.go @@ -105,21 +105,25 @@ For Windows you need instead to specify -user username:password on the command l } metadatafile := args[0] - filelistingPath := "" - folderlistingPath := "" + datasetFileListTxt := "" + folderListingTxt := "" absFileListing := "" if len(args) == 2 { - if args[1] == "folderlisting.txt" { - folderlistingPath = args[1] + argFileName := filepath.Base(args[1]) + if argFileName == "folderlisting.txt" { + // NOTE folderListingTxt is a TEXT FILE that lists dataset folders that should 
all be ingested together + // WITH the same metadata EXCEPT for the sourceFolder path (which is set during ingestion) + folderListingTxt = args[1] } else { - // NOTE filelistingPath is some kind of path to which the sourceFolder path should be relative - filelistingPath = args[1] - absFileListing, _ = filepath.Abs(filelistingPath) + // NOTE datasetFileListTxt is a TEXT FILE that lists the files & folders of a dataset (contained in a folder) + // that should be considered as "part of" the dataset. The paths must be relative to the sourceFolder. + datasetFileListTxt = args[1] + absFileListing, _ = filepath.Abs(datasetFileListTxt) } } if datasetUtils.TestArgs != nil { - datasetUtils.TestArgs([]interface{}{metadatafile, filelistingPath, folderlistingPath}) + datasetUtils.TestArgs([]interface{}{metadatafile, datasetFileListTxt, folderListingTxt}) return } @@ -164,48 +168,49 @@ For Windows you need instead to specify -user username:password on the command l /* TODO Add info about policy settings and that autoarchive will take place or not */ - metaDataMap, metaSourceFolder, beamlineAccount, err := datasetIngestor.CheckMetadata(client, APIServer, metadatafile, user, accessGroups) + metaDataMap, metadataSourceFolder, beamlineAccount, err := datasetIngestor.CheckMetadata(client, APIServer, metadatafile, user, accessGroups) if err != nil { log.Fatal("Error in CheckMetadata function: ", err) } //log.Printf("metadata object: %v\n", metaDataMap) - // assemble list of datasetFolders (=datasets) to be created - var datasetFolders []string - if folderlistingPath == "" { - datasetFolders = append(datasetFolders, metaSourceFolder) + // assemble list of datasetPaths (=datasets) to be created + var datasetPaths []string + if folderListingTxt == "" { + datasetPaths = append(datasetPaths, metadataSourceFolder) } else { // get folders from file - folderlist, err := os.ReadFile(folderlistingPath) + folderlist, err := os.ReadFile(folderListingTxt) if err != nil { log.Fatal(err) } lines := 
strings.Split(string(folderlist), "\n") // remove all empty and comment lines for _, line := range lines { - if line != "" && string(line[0]) != "#" { - // NOTE what is this special third level "data" folder that needs to be unsymlinked? - // convert into canonical form only for certain online data linked from eaccounts home directories - var parts = strings.Split(line, "/") - if len(parts) > 3 && parts[3] == "data" { - realSourceFolder, err := filepath.EvalSymlinks(line) - if err != nil { - log.Fatalf("Failed to find canonical form of sourceFolder:%v %v\n", line, err) - } - color.Set(color.FgYellow) - log.Printf("Transform sourceFolder %v to canonical form: %v\n", line, realSourceFolder) - color.Unset() - datasetFolders = append(datasetFolders, realSourceFolder) - } else { - datasetFolders = append(datasetFolders, line) + if line == "" || string(line[0]) == "#" { + continue + } + // NOTE what is this special third level "data" folder that needs to be unsymlinked? + // convert into canonical form only for certain online data linked from eaccounts home directories + var parts = strings.Split(line, "/") + if len(parts) > 3 && parts[3] == "data" { + realSourceFolder, err := filepath.EvalSymlinks(line) + if err != nil { + log.Fatalf("Failed to find canonical form of sourceFolder:%v %v\n", line, err) } + color.Set(color.FgYellow) + log.Printf("Transform sourceFolder %v to canonical form: %v\n", line, realSourceFolder) + color.Unset() + datasetPaths = append(datasetPaths, realSourceFolder) + } else { + datasetPaths = append(datasetPaths, line) } } } // log.Printf("Selected folders: %v\n", folders) // test if a sourceFolder already used in the past and give warning - foundList, err := datasetIngestor.TestForExistingSourceFolder(datasetFolders, client, APIServer, user["accessToken"]) + foundList, err := datasetIngestor.TestForExistingSourceFolder(datasetPaths, client, APIServer, user["accessToken"]) if err != nil { log.Fatal(err) } @@ -247,17 +252,20 @@ For Windows you 
need instead to specify -user username:password on the command l } var datasetList []string - for _, sourceFolder := range datasetFolders { + for _, datasetSourceFolder := range datasetPaths { // ignore empty lines - if sourceFolder == "" { + if datasetSourceFolder == "" { // NOTE if there are empty source folder(s), shouldn't we raise an error? continue } - metaDataMap["sourceFolder"] = sourceFolder - log.Printf("Scanning files in dataset %s", sourceFolder) + metaDataMap["sourceFolder"] = datasetSourceFolder + log.Printf("Scanning files in dataset %s", datasetSourceFolder) - fullFileArray, startTime, endTime, owner, numFiles, totalSize := - datasetIngestor.AssembleFilelisting(sourceFolder, filelistingPath, &skip) + fullFileArray, startTime, endTime, owner, numFiles, totalSize, err := + datasetIngestor.GetLocalFileList(datasetSourceFolder, datasetFileListTxt, &skip) + if err != nil { + log.Fatalf("Can't gather the filelist of \"%s\"", datasetSourceFolder) + } //log.Printf("full fileListing: %v\n Start and end time: %s %s\n ", fullFileArray, startTime, endTime) log.Printf("The dataset contains %v files with a total size of %v bytes.", numFiles, totalSize) @@ -283,10 +291,10 @@ For Windows you need instead to specify -user username:password on the command l // and unless copy flag defined via command line if !copyFlag && !nocopyFlag { // NOTE this whole copyFlag, nocopyFlag ordeal makes no sense whatsoever if !beamlineAccount { - err := datasetIngestor.CheckDataCentrallyAvailable(user["username"], RSYNCServer, sourceFolder) + err := datasetIngestor.CheckDataCentrallyAvailable(user["username"], RSYNCServer, datasetSourceFolder) if err != nil { color.Set(color.FgYellow) - log.Printf("The source folder %v is not centrally available (decentral use case).\nThe data must first be copied to a rsync cache server.\n ", sourceFolder) + log.Printf("The source folder %v is not centrally available (decentral use case).\nThe data must first be copied to a rsync cache server.\n ", 
datasetSourceFolder) color.Unset() copyFlag = true // check if user account @@ -338,7 +346,7 @@ For Windows you need instead to specify -user username:password on the command l datasetIngestor.AddAttachment(client, APIServer, datasetId, metaDataMap, user["accessToken"], addAttachment, addCaption) } if copyFlag { - err := datasetIngestor.SyncDataToFileserver(datasetId, user, RSYNCServer, sourceFolder, absFileListing) + err := datasetIngestor.SyncDataToFileserver(datasetId, user, RSYNCServer, datasetSourceFolder, absFileListing) if err == nil { // delayed enabling archivable = true diff --git a/datasetIngestor/assembleFilelisting.go b/datasetIngestor/assembleFilelisting.go index 3fc89e5..f2556d7 100644 --- a/datasetIngestor/assembleFilelisting.go +++ b/datasetIngestor/assembleFilelisting.go @@ -26,7 +26,9 @@ type Datafile struct { var skippedLinks = 0 var illegalFileNames = 0 var errorGroupIds = 0 + const windows = "windows" + var scanner = bufio.NewScanner(os.Stdin) // readLines reads a whole file into memory @@ -47,7 +49,7 @@ func readLines(path string) ([]string, error) { } /* -AssembleFilelisting scans a source folder and optionally a file listing, and returns a list of data files, the earliest and latest modification times, the owner, the number of files, and the total size of the files. +GetLocalFileList scans a source folder and optionally a file listing, and returns a list of data files, the earliest and latest modification times, the owner, the number of files, and the total size of the files. Parameters: - sourceFolder: The path to the source folder to scan. @@ -68,18 +70,17 @@ Returns: The function logs an error and returns if it cannot change the working directory to the source folder. 
*/ -func AssembleFilelisting(sourceFolder string, filelistingPath string, skip *string) (fullFileArray []Datafile, startTime time.Time, endTime time.Time, owner string, numFiles int64, totalSize int64) { +func GetLocalFileList(sourceFolder string, filelistingPath string, skip *string) (fullFileArray []Datafile, startTime time.Time, endTime time.Time, owner string, numFiles int64, totalSize int64, err error) { // scan all lines //fmt.Println("sourceFolder,listing:", sourceFolder, filelistingPath) fullFileArray = make([]Datafile, 0) - startTime = time.Date(2100, 1, 1, 12, 0, 0, 0, time.UTC) + startTime = time.Date(2500, 1, 1, 12, 0, 0, 0, time.UTC) endTime = time.Date(2000, 1, 1, 12, 0, 0, 0, time.UTC) owner = "" numFiles = 0 totalSize = 0 var lines []string - var err error if filelistingPath == "" { log.Printf("No explicit filelistingPath defined - full folder %s is used.\n", sourceFolder) @@ -97,24 +98,29 @@ func AssembleFilelisting(sourceFolder string, filelistingPath string, skip *stri // TODO verify that filelisting have no overlap, e.g. 
no lines X/ and X/Y, // because the latter is already contained in X/ - // restore cwd after function - cwd, _ := os.Getwd() - defer os.Chdir(cwd) + // restore oldWorkDir after function + oldWorkDir, err := os.Getwd() + if err != nil { + return fullFileArray, startTime, endTime, owner, numFiles, totalSize, err + } + + defer os.Chdir(oldWorkDir) // for windows source path add colon in the leading drive character // windowsSource := strings.Replace(sourceFolder, "/C/", "C:/", 1) - osSource := sourceFolder if runtime.GOOS == windows { re := regexp.MustCompile(`^\/([A-Z])\/`) - osSource = re.ReplaceAllString(sourceFolder, "$1:/") + sourceFolder = re.ReplaceAllString(sourceFolder, "$1:/") } - if err := os.Chdir(osSource); err != nil { + if err := os.Chdir(sourceFolder); err != nil { log.Printf("Can not step into sourceFolder %v - dataset will be ignored.\n", sourceFolder) - return fullFileArray, startTime, endTime, owner, numFiles, totalSize - } else { - dir, _ := os.Getwd() - log.Printf("Scanning source folder: %s at %s", sourceFolder, dir) + return fullFileArray, startTime, endTime, owner, numFiles, totalSize, err + } + dir, err := os.Getwd() + if err != nil { + return fullFileArray, startTime, endTime, owner, numFiles, totalSize, err } + log.Printf("Scanning source folder: %s at %s", sourceFolder, dir) // reinitaialize *skip variable unless valid for All Datasets @@ -146,41 +152,50 @@ func AssembleFilelisting(sourceFolder string, filelistingPath string, skip *stri if f.IsDir() && f.Name() == "." 
{ return nil } + // extract OS dependent owner IDs and translate to names - if err == nil { - uidName, gidName := GetFileOwner(f) - // replace backslashes for windows path - modpath := path - if runtime.GOOS == windows { - modpath = strings.Replace(path, "\\", "/", -1) - } - fileStruct := Datafile{Path: modpath, User: uidName, Group: gidName, Perm: f.Mode().String(), Size: f.Size(), Time: f.ModTime().Format(time.RFC3339)} - keep := true - if f.Mode()&os.ModeSymlink == os.ModeSymlink { - pointee, _ := os.Readlink(modpath) // just pass the file name - if !filepath.IsAbs(pointee) { - dir, err := os.Getwd() - // log.Printf(" CWD path pointee :%v %v %v", dir, filepath.Dir(path), pointee) - pabs := filepath.Join(dir, filepath.Dir(modpath), pointee) - pointee, err = filepath.EvalSymlinks(pabs) - if err != nil { - log.Printf("Could not follow symlink for file:%v %v", pabs, err) - keep = false - log.Printf("keep variable set to %v", keep) - } + if err != nil { + // stop function if err given by Walk is not nil + return err + } + uidName, gidName := GetFileOwner(f) + // replace backslashes for windows path + modpath := path + if runtime.GOOS == windows { + modpath = strings.Replace(path, "\\", "/", -1) + } + fileStruct := Datafile{Path: modpath, User: uidName, Group: gidName, Perm: f.Mode().String(), Size: f.Size(), Time: f.ModTime().Format(time.RFC3339)} + keep := true + + // * handle symlinks * + if f.Mode()&os.ModeSymlink != 0 { + pointee, _ := os.Readlink(modpath) // just pass the file name + if !filepath.IsAbs(pointee) { + dir, err := os.Getwd() + if err != nil { + return err } - //fmt.Printf("Skip variable:%v\n", *skip) - if *skip == "ka" || *skip == "kA" { - keep = true - } else if *skip == "sa" || *skip == "sA" { + // log.Printf(" CWD path pointee :%v %v %v", dir, filepath.Dir(path), pointee) + pabs := filepath.Join(dir, filepath.Dir(modpath), pointee) + pointee, err = filepath.EvalSymlinks(pabs) + if err != nil { + log.Printf("Could not follow symlink for file:%v 
%v", pabs, err) keep = false - } else if *skip == "da" || *skip == "dA" { - keep = strings.HasPrefix(pointee, sourceFolder) - } else { - color.Set(color.FgYellow) - log.Printf("Warning: the file %s is a link pointing to %v.", modpath, pointee) - color.Unset() - log.Printf(` + log.Printf("keep variable set to %v", keep) + } + } + //fmt.Printf("Skip variable:%v\n", *skip) + if *skip == "ka" || *skip == "kA" { + keep = true + } else if *skip == "sa" || *skip == "sA" { + keep = false + } else if *skip == "da" || *skip == "dA" { + keep = strings.HasPrefix(pointee, sourceFolder) + } else { + color.Set(color.FgYellow) + log.Printf("Warning: the file %s is a link pointing to %v.", modpath, pointee) + color.Unset() + log.Printf(` Please test if this link is meaningful and not pointing outside the sourceFolder %s. The default behaviour is to keep only internal links within a source folder. @@ -189,70 +204,68 @@ subsequent links within the current dataset, by appending an a (dA,ka,sa). If you want to give the same answer even to all subsequent datasets in this command then specify a capital 'A', e.g. 
(dA,kA,sA) Do you want to keep the link in dataset or skip it (D(efault)/k(eep)/s(kip) ?`, sourceFolder) - scanner.Scan() - *skip = scanner.Text() - if *skip == "" { - *skip = "d" - } - if *skip == "d" || *skip == "dA" { - keep = strings.HasPrefix(pointee, sourceFolder) - } else { - keep = (*skip != "s" && *skip != "sa" && *skip != "sA") - } + scanner.Scan() + *skip = scanner.Text() + if *skip == "" { + *skip = "d" } - if keep { - color.Set(color.FgGreen) - log.Printf("You chose to keep the link %v -> %v.\n\n", modpath, pointee) + if *skip == "d" || *skip == "dA" { + keep = strings.HasPrefix(pointee, sourceFolder) } else { - color.Set(color.FgRed) - skippedLinks++ - log.Printf("You chose to remove the link %v -> %v.\n\n", modpath, pointee) + keep = (*skip != "s" && *skip != "sa" && *skip != "sA") } - color.Unset() } - - // make sure that filenames do not contain characters like "\" or "*" - if strings.ContainsAny(modpath, "*\\") { + if keep { + color.Set(color.FgGreen) + log.Printf("You chose to keep the link %v -> %v.\n\n", modpath, pointee) + } else { color.Set(color.FgRed) - log.Printf("Warning: the file %s contains illegal characters like *,\\ and will not be archived.", modpath) - color.Unset() - illegalFileNames++ - keep = false + skippedLinks++ + log.Printf("You chose to remove the link %v -> %v.\n\n", modpath, pointee) } - // and check for triple blanks, they are used to separate columns in messages - if keep && strings.Contains(modpath, " ") { - color.Set(color.FgRed) - log.Printf("Warning: the file %s contains 3 consecutive blanks which is not allowed. 
The file not be archived.", modpath) - color.Unset() - illegalFileNames++ - keep = false + color.Unset() + } + + // * filter invalid filenames * + // make sure that filenames do not contain characters like "\" or "*" + if strings.ContainsAny(modpath, "*\\") { + color.Set(color.FgRed) + log.Printf("Warning: the file %s contains illegal characters like *,\\ and will not be archived.", modpath) + color.Unset() + illegalFileNames++ + keep = false + } + // and check for triple blanks, they are used to separate columns in messages + if keep && strings.Contains(modpath, " ") { + color.Set(color.FgRed) + log.Printf("Warning: the file %s contains 3 consecutive blanks which is not allowed. The file not be archived.", modpath) + color.Unset() + illegalFileNames++ + keep = false + } + if keep { + numFiles++ + totalSize += f.Size() + //fmt.Println(numFiles, totalSize) + //fullFileArray = append(fullFileArray, fileline) + fullFileArray = append(fullFileArray, fileStruct) + // find out earlist creation time + modTime := f.ModTime() + //fmt.Printf("FileTime:", modTime) + diff := modTime.Sub(startTime) + if diff < (time.Duration(0) * time.Second) { + startTime = modTime + // fmt.Printf("Earliest Time:%v\n", startTime) } - if keep { - numFiles++ - totalSize += f.Size() - //fmt.Println(numFiles, totalSize) - //fullFileArray = append(fullFileArray, fileline) - fullFileArray = append(fullFileArray, fileStruct) - // find out earlist creation time - modTime := f.ModTime() - //fmt.Printf("FileTime:", modTime) - diff := modTime.Sub(startTime) - if diff < (time.Duration(0) * time.Second) { - startTime = modTime - // fmt.Printf("Earliest Time:%v\n", startTime) - } - diff = modTime.Sub(endTime) - if diff > (time.Duration(0) * time.Second) { - endTime = modTime - //fmt.Printf("Last Time:%v\n", endTime) - } - owner = gidName + diff = modTime.Sub(endTime) + if diff > (time.Duration(0) * time.Second) { + endTime = modTime + //fmt.Printf("Last Time:%v\n", endTime) } - - 
//log.Println("Path:",modpath) - } else { - log.Println("Error:", err) + owner = gidName } + + //log.Println("Path:",modpath) return err }) @@ -261,7 +274,7 @@ Do you want to keep the link in dataset or skip it (D(efault)/k(eep)/s(kip) ?`, } } // spin.Stop() - return fullFileArray, startTime, endTime, owner, numFiles, totalSize + return fullFileArray, startTime, endTime, owner, numFiles, totalSize, err } func PrintFileInfos() { diff --git a/datasetIngestor/assembleFilelisting_test.go b/datasetIngestor/assembleFilelisting_test.go index 748493b..e9d2d75 100644 --- a/datasetIngestor/assembleFilelisting_test.go +++ b/datasetIngestor/assembleFilelisting_test.go @@ -14,18 +14,21 @@ func TestAssembleFilelisting(t *testing.T) { t.Fatalf("Failed to create temp directory: %s", err) } defer os.RemoveAll(tempDir) - + // Create a file in the temporary directory fileName := "testfile" filePath := filepath.Join(tempDir, fileName) if err := os.WriteFile(filePath, []byte("test"), 0644); err != nil { t.Fatalf("Failed to create test file: %s", err) } - + // Call AssembleFilelisting on the temporary directory skip := "" - fullFileArray, startTime, endTime, _, numFiles, totalSize := AssembleFilelisting(tempDir, "", &skip) - + fullFileArray, startTime, endTime, _, numFiles, totalSize, err := GetLocalFileList(tempDir, "", &skip) + if err != nil { + t.Errorf("got error: %v", err) + } + // Check that the returned file array contains the correct file if len(fullFileArray) != 1 { t.Fatalf("Expected 1 file, got %d", len(fullFileArray)) @@ -43,10 +46,10 @@ func TestAssembleFilelisting(t *testing.T) { if time.Since(fileTime) > time.Second { t.Errorf("Expected file time within 1 second of now") } - + // Check the other outputs of AssembleFilelisting if time.Since(startTime) > time.Second { - t.Errorf("Expected start time within 1 second of now") + t.Errorf("Expected start time within 1 second of now - res: %s", startTime) } if time.Since(endTime) > time.Second { t.Errorf("Expected end time 
within 1 second of now") From 536181fc4419bcfe08b91528314005c6707fcb49 Mon Sep 17 00:00:00 2001 From: consolethinks Date: Wed, 7 Aug 2024 17:34:09 +0200 Subject: [PATCH 3/5] testForExistingSourceFolder: remove logging --- .../testForExistingSourceFolder.go | 26 +++++++++---------- .../testForExistingSourceFolder_test.go | 15 ++++++++--- 2 files changed, 25 insertions(+), 16 deletions(-) diff --git a/datasetIngestor/testForExistingSourceFolder.go b/datasetIngestor/testForExistingSourceFolder.go index 1065e47..177c2d0 100644 --- a/datasetIngestor/testForExistingSourceFolder.go +++ b/datasetIngestor/testForExistingSourceFolder.go @@ -4,7 +4,6 @@ import ( "encoding/json" "fmt" "io" - "log" "net/http" "strings" ) @@ -53,8 +52,7 @@ func TestForExistingSourceFolder(folders []string, client *http.Client, APIServe if end > all { end = all } - // TODO remove logs - log.Printf("Checking sourceFolder %v to %v for existing entries...\n", start+1, end) + //log.Printf("Checking sourceFolder %v to %v for existing entries...\n", start+1, end) sourceFolderList := strings.Join(folders[start:end], "\",\"") filter := createFilter(sourceFolderList) @@ -62,9 +60,13 @@ func TestForExistingSourceFolder(folders []string, client *http.Client, APIServe if err != nil { return DatasetQuery{}, err } - foundList = append(foundList, processResponse(resp)...) + processedResp, err := processResponse(resp) + if err != nil { + return foundList, err + } + foundList = append(foundList, processedResp...) 
} - return foundList, nil + return foundList, err } func createFilter(sourceFolderList string) string { @@ -83,24 +85,22 @@ func makeRequest(client *http.Client, url string, filter string) (*http.Response resp, err := client.Do(req) if err != nil { - log.Fatal(err) + return nil, err } defer resp.Body.Close() return resp, nil } -func processResponse(resp *http.Response) DatasetQuery { +func processResponse(resp *http.Response) (DatasetQuery, error) { body, _ := io.ReadAll(resp.Body) var respObj DatasetQuery if len(body) == 0 { - // TODO remove logs - log.Printf("Warning: Response body is empty") - return respObj + // ignoring empty response... + return respObj, nil } err := json.Unmarshal(body, &respObj) if err != nil { - // TODO remove logs - log.Printf("Error: Failed to parse JSON response: %v", err) + return DatasetQuery{}, fmt.Errorf("failed to parse JSON response: %v", err) } - return respObj + return respObj, nil } diff --git a/datasetIngestor/testForExistingSourceFolder_test.go b/datasetIngestor/testForExistingSourceFolder_test.go index 7ea4b9e..7bdc876 100644 --- a/datasetIngestor/testForExistingSourceFolder_test.go +++ b/datasetIngestor/testForExistingSourceFolder_test.go @@ -62,7 +62,10 @@ func TestProcessResponse(t *testing.T) { resp := &http.Response{ Body: io.NopCloser(bytes.NewBufferString(validJSON)), } - result := processResponse(resp) + result, err := processResponse(resp) + if err != nil { + t.Fatalf("Unexpected error: %v", err) + } if len(result) != 1 || result[0].Pid != "123" || result[0].SourceFolder != "folder" || result[0].Size != 100 { t.Errorf("Unexpected result: %v", result) } @@ -72,7 +75,10 @@ func TestProcessResponse(t *testing.T) { resp = &http.Response{ Body: io.NopCloser(bytes.NewBufferString(invalidJSON)), } - result = processResponse(resp) + result, err = processResponse(resp) + if err == nil { + t.Errorf("Expected error from processResponse") + } if len(result) != 0 { t.Errorf("Expected empty QueryResult, got '%v'", result) } @@ 
-81,7 +87,10 @@ func TestProcessResponse(t *testing.T) { resp = &http.Response{ Body: io.NopCloser(bytes.NewBufferString("")), } - result = processResponse(resp) + result, err = processResponse(resp) + if err != nil { + t.Fatalf("Unexpected error: %v", err) + } if len(result) != 0 { t.Errorf("Expected empty QueryResult, got '%v'", result) } From add9f2e53a1cad38b0b83b41fa4dd45aa40b69aa Mon Sep 17 00:00:00 2001 From: consolethinks Date: Fri, 9 Aug 2024 17:35:29 +0200 Subject: [PATCH 4/5] testForExistingSourceFolder: fix a bug that discarded the gathered list --- datasetIngestor/testForExistingSourceFolder.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasetIngestor/testForExistingSourceFolder.go b/datasetIngestor/testForExistingSourceFolder.go index 177c2d0..ca1428b 100644 --- a/datasetIngestor/testForExistingSourceFolder.go +++ b/datasetIngestor/testForExistingSourceFolder.go @@ -60,6 +60,7 @@ func TestForExistingSourceFolder(folders []string, client *http.Client, APIServe if err != nil { return DatasetQuery{}, err } + defer resp.Body.Close() processedResp, err := processResponse(resp) if err != nil { return foundList, err @@ -87,7 +88,6 @@ func makeRequest(client *http.Client, url string, filter string) (*http.Response if err != nil { return nil, err } - defer resp.Body.Close() return resp, nil } From 505b7ae1659d61e755ec77ca9618188408967f68 Mon Sep 17 00:00:00 2001 From: consolethinks Date: Mon, 19 Aug 2024 17:18:24 +0200 Subject: [PATCH 5/5] remove commented out logs --- datasetIngestor/assembleFilelisting.go | 8 -------- datasetIngestor/testForExistingSourceFolder.go | 1 - 2 files changed, 9 deletions(-) diff --git a/datasetIngestor/assembleFilelisting.go b/datasetIngestor/assembleFilelisting.go index f2556d7..74c8238 100644 --- a/datasetIngestor/assembleFilelisting.go +++ b/datasetIngestor/assembleFilelisting.go @@ -132,15 +132,9 @@ func GetLocalFileList(sourceFolder string, filelistingPath string, skip *string) // spin.Color("green") 
for _, line := range lines { - // log.Printf("Inside lines loop:%s\n", line) if len(line) == 0 { continue } - // do not printout the default "./" case - // if len(line) > 2 { - // log.Printf("Line: %s %d\n", line, len(line)) - //} - // if folder make recursive ls // spin.Start() // Start the spinner e := filepath.Walk(line, func(path string, f os.FileInfo, err error) error { @@ -175,7 +169,6 @@ func GetLocalFileList(sourceFolder string, filelistingPath string, skip *string) if err != nil { return err } - // log.Printf(" CWD path pointee :%v %v %v", dir, filepath.Dir(path), pointee) pabs := filepath.Join(dir, filepath.Dir(modpath), pointee) pointee, err = filepath.EvalSymlinks(pabs) if err != nil { @@ -265,7 +258,6 @@ Do you want to keep the link in dataset or skip it (D(efault)/k(eep)/s(kip) ?`, owner = gidName } - //log.Println("Path:",modpath) return err }) diff --git a/datasetIngestor/testForExistingSourceFolder.go b/datasetIngestor/testForExistingSourceFolder.go index ca1428b..72f8b3e 100644 --- a/datasetIngestor/testForExistingSourceFolder.go +++ b/datasetIngestor/testForExistingSourceFolder.go @@ -52,7 +52,6 @@ func TestForExistingSourceFolder(folders []string, client *http.Client, APIServe if end > all { end = all } - //log.Printf("Checking sourceFolder %v to %v for existing entries...\n", start+1, end) sourceFolderList := strings.Join(folders[start:end], "\",\"") filter := createFilter(sourceFolderList)