Refactor/files functions #98

Merged 5 commits on Aug 19, 2024

Changes from all commits
116 changes: 70 additions & 46 deletions cmd/commands/datasetIngestor.go
@@ -105,30 +105,28 @@ For Windows you need instead to specify -user username:password on the command line
}

metadatafile := args[0]
filelistingPath := ""
folderlistingPath := ""
datasetFileListTxt := ""
folderListingTxt := ""
absFileListing := ""
if len(args) == 2 {
if args[1] == "folderlisting.txt" {
folderlistingPath = args[1]
argFileName := filepath.Base(args[1])
if argFileName == "folderlisting.txt" {
// NOTE folderListingTxt is a TEXT FILE that lists dataset folders that should all be ingested together
// WITH the same metadata EXCEPT for the sourceFolder path (which is set during ingestion)
folderListingTxt = args[1]
} else {
// NOTE filelistingPath is some kind of path to which the sourceFolder path should be relative
filelistingPath = args[1]
absFileListing, _ = filepath.Abs(filelistingPath)
// NOTE datasetFileListTxt is a TEXT FILE that lists the files & folders of a dataset (contained in a folder)
// that should be considered as "part of" the dataset. The paths must be relative to the sourceFolder.
datasetFileListTxt = args[1]
absFileListing, _ = filepath.Abs(datasetFileListTxt)
}
}

if datasetUtils.TestArgs != nil {
datasetUtils.TestArgs([]interface{}{metadatafile, filelistingPath, folderlistingPath})
datasetUtils.TestArgs([]interface{}{metadatafile, datasetFileListTxt, folderListingTxt})
return
}

// functions use this flag in a way where "nil -> unset"
var allowExistingSourceFolderPtr *bool = &allowExistingSourceFolder
if !noninteractiveFlag && !cmd.Flags().Lookup("allowexistingsource").Changed {
allowExistingSourceFolderPtr = nil
}

if showVersion {
fmt.Printf("%s\n", VERSION)
return
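Aside on the hunk above: a minimal, self-contained sketch of how the second positional argument is now classified. `classifyListingArg` is a hypothetical helper introduced only for illustration; the `filepath.Base` comparison and the `filepath.Abs` call mirror the diff.

```go
package main

import (
	"fmt"
	"path/filepath"
)

// classifyListingArg mirrors the argument handling in this hunk: a file named
// exactly "folderlisting.txt" lists dataset folders to be ingested together,
// while any other second argument is treated as the file listing of a single
// dataset and additionally resolved to an absolute path.
func classifyListingArg(arg string) (folderListingTxt, datasetFileListTxt, absFileListing string, err error) {
	if filepath.Base(arg) == "folderlisting.txt" {
		folderListingTxt = arg
		return
	}
	datasetFileListTxt = arg
	absFileListing, err = filepath.Abs(datasetFileListTxt)
	return
}

func main() {
	folderListing, fileList, absFileList, err := classifyListingArg("./filelisting.txt")
	fmt.Println(folderListing, fileList, absFileList, err)
}
```

The only special case is the literal file name folderlisting.txt; everything else ends up in datasetFileListTxt.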
@@ -170,48 +168,70 @@ For Windows you need instead to specify -user username:password on the command line

/* TODO Add info about policy settings and that autoarchive will take place or not */

metaDataMap, metaSourceFolder, beamlineAccount, err := datasetIngestor.CheckMetadata(client, APIServer, metadatafile, user, accessGroups)
metaDataMap, metadataSourceFolder, beamlineAccount, err := datasetIngestor.CheckMetadata(client, APIServer, metadatafile, user, accessGroups)
if err != nil {
log.Fatal("Error in CheckMetadata function: ", err)
}
//log.Printf("metadata object: %v\n", metaDataMap)

// assemble list of datasetFolders (=datasets) to be created
var datasetFolders []string
if folderlistingPath == "" {
datasetFolders = append(datasetFolders, metaSourceFolder)
// assemble list of datasetPaths (=datasets) to be created
var datasetPaths []string
if folderListingTxt == "" {
datasetPaths = append(datasetPaths, metadataSourceFolder)
} else {
// get folders from file
folderlist, err := os.ReadFile(folderlistingPath)
folderlist, err := os.ReadFile(folderListingTxt)
if err != nil {
log.Fatal(err)
}
lines := strings.Split(string(folderlist), "\n")
// remove all empty and comment lines
for _, line := range lines {
if line != "" && string(line[0]) != "#" {
// NOTE what is this special third level "data" folder that needs to be unsymlinked?
// convert into canonical form only for certain online data linked from eaccounts home directories
var parts = strings.Split(line, "/")
if len(parts) > 3 && parts[3] == "data" {
realSourceFolder, err := filepath.EvalSymlinks(line)
if err != nil {
log.Fatalf("Failed to find canonical form of sourceFolder:%v %v", line, err)
}
color.Set(color.FgYellow)
log.Printf("Transform sourceFolder %v to canonical form: %v", line, realSourceFolder)
color.Unset()
datasetFolders = append(datasetFolders, realSourceFolder)
} else {
datasetFolders = append(datasetFolders, line)
if line == "" || string(line[0]) == "#" {
continue
}
// NOTE what is this special third level "data" folder that needs to be unsymlinked?
// convert into canonical form only for certain online data linked from eaccounts home directories
var parts = strings.Split(line, "/")
if len(parts) > 3 && parts[3] == "data" {
realSourceFolder, err := filepath.EvalSymlinks(line)
if err != nil {
log.Fatalf("Failed to find canonical form of sourceFolder:%v %v\n", line, err)
}
color.Set(color.FgYellow)
log.Printf("Transform sourceFolder %v to canonical form: %v\n", line, realSourceFolder)
color.Unset()
datasetPaths = append(datasetPaths, realSourceFolder)
} else {
datasetPaths = append(datasetPaths, line)
}
}
}
// log.Printf("Selected folders: %v\n", folders)

// test if a sourceFolder already used in the past and give warning
datasetIngestor.TestForExistingSourceFolder(datasetFolders, client, APIServer, user["accessToken"], allowExistingSourceFolderPtr)
foundList, err := datasetIngestor.TestForExistingSourceFolder(datasetPaths, client, APIServer, user["accessToken"])
if err != nil {
log.Fatal(err)
}
color.Set(color.FgYellow)
fmt.Println("Warning! The following datasets have been found with the same sourceFolder: ")
for _, element := range foundList {
fmt.Printf(" - PID: \"%s\", sourceFolder: \"%s\"\n", element.Pid, element.SourceFolder)
}
color.Unset()
if !allowExistingSourceFolder && len(foundList) > 0 {
if cmd.Flags().Changed("allowexistingsource") {
log.Printf("Do you want to ingest the corresponding new datasets nevertheless (y/N) ? ")
scanner.Scan()
archiveAgain := scanner.Text()
if archiveAgain != "y" {
log.Fatalln("Aborted.")
}
} else {
log.Fatalln("Existing sourceFolders are not allowed. Aborted.")
}
}

// TODO ask archive system if sourcefolder is known to them. If yes no copy needed, otherwise
// a destination location is defined by the archive system
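For readers unfamiliar with folderlisting.txt, a minimal sketch of how its entries are filtered and canonicalized, following the loop above. `collectDatasetPaths` and the example paths are made up for illustration; the comment-line check, the `parts[3] == "data"` test and the `EvalSymlinks` call mirror the diff.

```go
package main

import (
	"fmt"
	"log"
	"path/filepath"
	"strings"
)

// collectDatasetPaths turns the contents of a folderlisting.txt into the list
// of dataset source folders: empty and "#" comment lines are skipped, and
// paths whose third directory level is "data" (e.g. /facility/beamline/data/...)
// are resolved to their canonical, symlink-free form.
func collectDatasetPaths(folderlist string) []string {
	var datasetPaths []string
	for _, line := range strings.Split(folderlist, "\n") {
		if line == "" || line[0] == '#' {
			continue
		}
		parts := strings.Split(line, "/")
		if len(parts) > 3 && parts[3] == "data" {
			realSourceFolder, err := filepath.EvalSymlinks(line)
			if err != nil {
				log.Fatalf("Failed to find canonical form of sourceFolder: %v %v", line, err)
			}
			line = realSourceFolder
		}
		datasetPaths = append(datasetPaths, line)
	}
	return datasetPaths
}

func main() {
	// example listing: one comment line, one folder per non-empty line
	listing := "# folders ingested together with the same metadata\n/tmp\n\n/var/tmp\n"
	fmt.Println(collectDatasetPaths(listing))
}
```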
@@ -220,7 +240,7 @@ For Windows you need instead to specify -user username:password on the command line
// now everything is prepared, start to loop over all folders
var skip = ""
// check if skip flag is globally defined via flags:
if cmd.Flags().Lookup("linkfiles").Changed {
if cmd.Flags().Changed("linkfiles") {
switch linkfiles {
case "delete":
skip = "sA"
@@ -232,17 +252,20 @@ For Windows you need instead to specify -user username:password on the command line
}

var datasetList []string
for _, sourceFolder := range datasetFolders {
for _, datasetSourceFolder := range datasetPaths {
// ignore empty lines
if sourceFolder == "" {
if datasetSourceFolder == "" {
// NOTE if there are empty source folder(s), shouldn't we raise an error?
continue
}
metaDataMap["sourceFolder"] = sourceFolder
log.Printf("Scanning files in dataset %s", sourceFolder)
metaDataMap["sourceFolder"] = datasetSourceFolder
log.Printf("Scanning files in dataset %s", datasetSourceFolder)

fullFileArray, startTime, endTime, owner, numFiles, totalSize :=
datasetIngestor.AssembleFilelisting(sourceFolder, filelistingPath, &skip)
fullFileArray, startTime, endTime, owner, numFiles, totalSize, err :=
datasetIngestor.GetLocalFileList(datasetSourceFolder, datasetFileListTxt, &skip)
if err != nil {
log.Fatalf("Can't gather the filelist of \"%s\"", datasetSourceFolder)
}
//log.Printf("full fileListing: %v\n Start and end time: %s %s\n ", fullFileArray, startTime, endTime)
log.Printf("The dataset contains %v files with a total size of %v bytes.", numFiles, totalSize)

@@ -268,10 +291,10 @@ For Windows you need instead to specify -user username:password on the command line
// and unless copy flag defined via command line
if !copyFlag && !nocopyFlag { // NOTE this whole copyFlag, nocopyFlag ordeal makes no sense whatsoever
if !beamlineAccount {
err := datasetIngestor.CheckDataCentrallyAvailable(user["username"], RSYNCServer, sourceFolder)
err := datasetIngestor.CheckDataCentrallyAvailable(user["username"], RSYNCServer, datasetSourceFolder)
if err != nil {
color.Set(color.FgYellow)
log.Printf("The source folder %v is not centrally available (decentral use case).\nThe data must first be copied to a rsync cache server.\n ", sourceFolder)
log.Printf("The source folder %v is not centrally available (decentral use case).\nThe data must first be copied to a rsync cache server.\n ", datasetSourceFolder)
color.Unset()
copyFlag = true
// check if user account
Expand Down Expand Up @@ -323,7 +346,7 @@ For Windows you need instead to specify -user username:password on the command l
datasetIngestor.AddAttachment(client, APIServer, datasetId, metaDataMap, user["accessToken"], addAttachment, addCaption)
}
if copyFlag {
err := datasetIngestor.SyncDataToFileserver(datasetId, user, RSYNCServer, sourceFolder, absFileListing)
err := datasetIngestor.SyncDataToFileserver(datasetId, user, RSYNCServer, datasetSourceFolder, absFileListing)
if err == nil {
// delayed enabling
archivable = true
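The copy/nocopy handling above condenses to a small decision. A sketch under the assumption that the flags keep their current meaning; `needsCopy` is a hypothetical helper, not part of the PR.

```go
package main

import (
	"errors"
	"fmt"
)

// needsCopy condenses the decision visible in this hunk: explicit --copy or
// --nocopy flags win, beamline accounts skip the availability check, and
// everyone else copies whenever the data is not centrally available.
func needsCopy(centralCheckErr error, beamlineAccount, copyFlag, nocopyFlag bool) bool {
	if copyFlag || nocopyFlag {
		return copyFlag
	}
	if beamlineAccount {
		return false
	}
	return centralCheckErr != nil // decentral use case: rsync to a cache server first
}

func main() {
	fmt.Println(needsCopy(errors.New("not reachable on central storage"), false, false, false)) // true
	fmt.Println(needsCopy(nil, false, false, false))                                            // false
}
```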
@@ -402,4 +425,5 @@ func init() {
datasetIngestorCmd.Flags().String("addcaption", "", "Optional caption to be stored with attachment (single dataset case only)")

datasetIngestorCmd.MarkFlagsMutuallyExclusive("testenv", "devenv", "localenv", "tunnelenv")
//datasetIngestorCmd.MarkFlagsMutuallyExclusive("nocopy", "copy")
}