Skip to content

Commit

Permalink
Use label for ost0 and move OST0 ordering to NnfStorage controller
Browse files Browse the repository at this point in the history
Signed-off-by: Blake Devcich <[email protected]>
  • Loading branch information
bdevcich committed Nov 13, 2024
1 parent f4bd5c1 commit f3983e8
Show file tree
Hide file tree
Showing 3 changed files with 91 additions and 50 deletions.
2 changes: 2 additions & 0 deletions api/v1alpha3/nnfstorage_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ import (

const (
// AllocationSetLabel is the label key placed on each NnfNodeStorage; the NnfStorage
// controller sets its value to the name of the allocation set the resource belongs to
// (for example "ost"), so the resources of one allocation set can be selected by label.
AllocationSetLabel = "nnf.cray.hpe.com/allocationset"
// TODO: export the OST0 label key here so the NnfStorage controller can reference it
// instead of repeating the "nnf.cray.hpe.com/allocationset_ost0" string literal.
// AllocationSetOST0Label = "nnf.cray.hpe.com/allocationset_ost0"
)

// NnfStorageAllocationNodes identifies the node and properties of the allocation to make on that node
Expand Down
98 changes: 49 additions & 49 deletions internal/controller/nnf_node_storage_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ import (

"github.com/go-logr/logr"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
kruntime "k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
ctrl "sigs.k8s.io/controller-runtime"
Expand All @@ -40,7 +39,6 @@ import (

dwsv1alpha2 "github.com/DataWorkflowServices/dws/api/v1alpha2"
"github.com/DataWorkflowServices/dws/utils/updater"
"github.com/NearNodeFlash/nnf-sos/api/v1alpha3"
nnfv1alpha3 "github.com/NearNodeFlash/nnf-sos/api/v1alpha3"
"github.com/NearNodeFlash/nnf-sos/internal/controller/metrics"
)
Expand Down Expand Up @@ -243,60 +241,60 @@ func (r *NnfNodeStorageReconciler) deleteAllocation(ctx context.Context, nnfNode
}

// For Lustre, wait until OST0 is gone first so that PreUnmount can run before any allocations are deleted
doPreUnmount := false
lustreOST0 := nnfNodeStorage.Spec.FileSystemType == "lustre" && nnfNodeStorage.Spec.LustreStorage.TargetType == "ost" && nnfNodeStorage.Spec.LustreStorage.StartIndex == 0
if lustreOST0 || nnfNodeStorage.Spec.FileSystemType != "lustre" {
doPreUnmount = true

} else if nnfNodeStorage.Spec.FileSystemType == "lustre" {
waitForOST0Deletion := func() (bool, error) {
// Get the owner and directive index from labels
ownerKind, ownerExists := nnfNodeStorage.Labels[dwsv1alpha2.OwnerKindLabel]
ownerName, ownerNameExists := nnfNodeStorage.Labels[dwsv1alpha2.OwnerNameLabel]
ownerNS, ownerNSExists := nnfNodeStorage.Labels[dwsv1alpha2.OwnerNamespaceLabel]
if !ownerExists || !ownerNameExists || !ownerNSExists || ownerKind != "NnfStorage" {
return false, dwsv1alpha2.NewResourceError("expected NnfNodeStorage owner to be of kind NnfStorage and have the expected labels").WithMajor()
}
/*
doPreUnmount := false
if lustreOST0 || nnfNodeStorage.Spec.FileSystemType != "lustre" {
doPreUnmount = true
} else if nnfNodeStorage.Spec.FileSystemType == "lustre" {
waitForOST0Deletion := func() (bool, error) {
// Get the owner and directive index from labels
ownerKind, ownerExists := nnfNodeStorage.Labels[dwsv1alpha2.OwnerKindLabel]
ownerName, ownerNameExists := nnfNodeStorage.Labels[dwsv1alpha2.OwnerNameLabel]
ownerNS, ownerNSExists := nnfNodeStorage.Labels[dwsv1alpha2.OwnerNamespaceLabel]
if !ownerExists || !ownerNameExists || !ownerNSExists || ownerKind != "NnfStorage" {
return false, dwsv1alpha2.NewResourceError("expected NnfNodeStorage owner to be of kind NnfStorage and have the expected labels").WithMajor()
}
// Get the owner
storage := &v1alpha3.NnfStorage{ObjectMeta: metav1.ObjectMeta{
Name: ownerName,
Namespace: ownerNS,
}}
if err := r.Get(ctx, client.ObjectKeyFromObject(storage), storage); err != nil {
return false, dwsv1alpha2.NewResourceError("unable retrieve NnfStorage resource").WithError(err).WithMajor()
}
// Get the owner
storage := &v1alpha3.NnfStorage{ObjectMeta: metav1.ObjectMeta{
Name: ownerName,
Namespace: ownerNS,
}}
if err := r.Get(ctx, client.ObjectKeyFromObject(storage), storage); err != nil {
return false, dwsv1alpha2.NewResourceError("unable retrieve NnfStorage resource").WithError(err).WithMajor()
}
// Get al the NnfNodeStorages for the OSTs
nnfNodeStorageList := &nnfv1alpha3.NnfNodeStorageList{}
matchLabels := dwsv1alpha2.MatchingOwner(storage)
matchLabels[nnfv1alpha3.AllocationSetLabel] = "ost"
// Get al the NnfNodeStorages for the OSTs
nnfNodeStorageList := &nnfv1alpha3.NnfNodeStorageList{}
matchLabels := dwsv1alpha2.MatchingOwner(storage)
matchLabels[nnfv1alpha3.AllocationSetLabel] = "ost"
listOptions := []client.ListOption{
matchLabels,
}
listOptions := []client.ListOption{
matchLabels,
}
if err := r.List(ctx, nnfNodeStorageList, listOptions...); err != nil {
return false, dwsv1alpha2.NewResourceError("could not list NnfNodeStorages").WithError(err)
}
if err := r.List(ctx, nnfNodeStorageList, listOptions...); err != nil {
return false, dwsv1alpha2.NewResourceError("could not list NnfNodeStorages").WithError(err)
}
// wait until OST0 no longer exists
for _, nnfNodeStorage := range nnfNodeStorageList.Items {
if nnfNodeStorage.Spec.LustreStorage.StartIndex == 0 {
return false, nil
// wait until OST0 no longer exists
for _, nnfNodeStorage := range nnfNodeStorageList.Items {
if nnfNodeStorage.Spec.LustreStorage.StartIndex == 0 {
return false, nil
}
}
}
return true, nil
}
return true, nil
}
// Wait for OST0 to be deleted first so it can run PreUnmount
if deleted, err := waitForOST0Deletion(); err != nil {
return nil, dwsv1alpha2.NewResourceError("failed to wait for lustre OST0 deletion").WithError(err).WithMajor()
} else if !deleted {
return &ctrl.Result{Requeue: true}, nil
}
}
// Wait for OST0 to be deleted first so it can run PreUnmount
if deleted, err := waitForOST0Deletion(); err != nil {
return nil, dwsv1alpha2.NewResourceError("failed to wait for lustre OST0 deletion").WithError(err).WithMajor()
} else if !deleted {
return &ctrl.Result{Requeue: true}, nil
}
}*/

if blockDeviceExists && nnfNodeStorage.Status.Allocations[index].Ready {
ran, err := blockDevice.Activate(ctx)
Expand All @@ -315,7 +313,9 @@ func (r *NnfNodeStorageReconciler) deleteAllocation(ctx context.Context, nnfNode
log.Info("Activated file system", "allocation", index)
}

if doPreUnmount {
// if doPreUnmount {
lustreOST0 := nnfNodeStorage.Spec.FileSystemType == "lustre" && nnfNodeStorage.Spec.LustreStorage.TargetType == "ost" && nnfNodeStorage.Spec.LustreStorage.StartIndex == 0
if lustreOST0 || nnfNodeStorage.Spec.FileSystemType != "lustre" {
ran, err = fileSystem.PreUnmount(ctx)
if err != nil {
return nil, dwsv1alpha2.NewResourceError("could not run pre unmount for file system").WithError(err).WithMajor()
Expand Down
41 changes: 40 additions & 1 deletion internal/controller/nnf_storage_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -617,6 +617,10 @@ func (r *NnfStorageReconciler) createNodeStorage(ctx context.Context, storage *n

labels := nnfNodeStorage.GetLabels()
labels[nnfv1alpha3.AllocationSetLabel] = allocationSet.Name
if lustreOST && startIndex == 0 {
// TODO: use label from API
labels["nnf.cray.hpe.com/allocationset_ost0"] = "true"
}
nnfNodeStorage.SetLabels(labels)

nnfNodeStorage.Spec.BlockReference = corev1.ObjectReference{
Expand Down Expand Up @@ -1082,6 +1086,13 @@ func (r *NnfStorageReconciler) teardownStorage(ctx context.Context, storage *nnf
&nnfv1alpha3.NnfNodeStorageList{},
}

// Delete OST0 first so that PreUnmount commands can happen
// TODO: use label from API
ost0DeleteStatus, err := dwsv1alpha2.DeleteChildrenWithLabels(ctx, r.Client, childObjects, storage, client.MatchingLabels{"nnf.cray.hpe.com/allocationset_ost0": "true"})
if err != nil {
return nodeStoragesExist, err
}

ostDeleteStatus, err := dwsv1alpha2.DeleteChildrenWithLabels(ctx, r.Client, childObjects, storage, client.MatchingLabels{nnfv1alpha3.AllocationSetLabel: "ost"})
if err != nil {
return nodeStoragesExist, err
Expand All @@ -1101,7 +1112,7 @@ func (r *NnfStorageReconciler) teardownStorage(ctx context.Context, storage *nnf
}
}

if !ostDeleteStatus.Complete() || !mdtDeleteStatus.Complete() {
if !ost0DeleteStatus.Complete() || !ostDeleteStatus.Complete() || !mdtDeleteStatus.Complete() {
return nodeStoragesExist, nil
}

Expand Down Expand Up @@ -1221,6 +1232,34 @@ func nnfNodeStorageName(storage *nnfv1alpha3.NnfStorage, allocationSetIndex int,
return storage.Namespace + "-" + storage.Name + "-" + storage.Spec.AllocationSets[allocationSetIndex].Name + "-" + strconv.Itoa(duplicateRabbitIndex)
}

// getLustreOST0 returns the NnfNodeStorage that holds Lustre OST0 for the given
// NnfStorage, i.e. the "ost" allocation-set resource whose LustreStorage.StartIndex is 0.
//
// It returns (nil, nil) when the NnfStorage is not a Lustre file system or when no
// matching NnfNodeStorage is found, so callers must check the result for nil even
// when the error is nil. A non-nil error is returned only when the list call fails.
func (r *NnfStorageReconciler) getLustreOST0(ctx context.Context, storage *nnfv1alpha3.NnfStorage) (*nnfv1alpha3.NnfNodeStorage, error) {
	if storage.Spec.FileSystemType != "lustre" {
		return nil, nil
	}

	// Get all the NnfNodeStorages for the OSTs that match this NnfStorage's owner labels
	nnfNodeStorageList := &nnfv1alpha3.NnfNodeStorageList{}
	matchLabels := dwsv1alpha2.MatchingOwner(storage)
	matchLabels[nnfv1alpha3.AllocationSetLabel] = "ost"

	listOptions := []client.ListOption{
		matchLabels,
	}

	if err := r.List(ctx, nnfNodeStorageList, listOptions...); err != nil {
		return nil, dwsv1alpha2.NewResourceError("could not list NnfNodeStorages").WithError(err)
	}

	// Index into the slice rather than ranging by value: this avoids copying every
	// NnfNodeStorage and avoids returning the address of the loop variable.
	items := nnfNodeStorageList.Items
	for i := range items {
		if items[i].Spec.LustreStorage.StartIndex == 0 {
			return &items[i], nil
		}
	}

	return nil, nil
}

// SetupWithManager sets up the controller with the Manager.
func (r *NnfStorageReconciler) SetupWithManager(mgr ctrl.Manager) error {
r.ChildObjects = []dwsv1alpha2.ObjectList{
Expand Down

0 comments on commit f3983e8

Please sign in to comment.