fix(evpn-bridge): fix system behaviour for pending objects #391

Open · wants to merge 4 commits into base: main

Changes from 1 commit
17 changes: 14 additions & 3 deletions pkg/infradb/subscriberframework/eventbus/eventbus.go
@@ -6,6 +6,7 @@
 package eventbus
 
 import (
+	"fmt"
 	"log"
 	"sort"
 	"sync"
@@ -89,7 +90,7 @@

 	subscriber := &Subscriber{
 		Name:     moduleName,
-		Ch:       make(chan interface{}, 1),
+		Ch:       make(chan interface{}),
 		Quit:     make(chan bool),
 		Priority: priority,
 	}
@@ -128,10 +129,20 @@
 }
 
 // Publish api notifies the subscribers with a certain eventType
-func (e *EventBus) Publish(objectData *ObjectData, subscriber *Subscriber) {
+func (e *EventBus) Publish(objectData *ObjectData, subscriber *Subscriber) error {

 	e.publishL.RLock()
Contributor: Hmm... if we write into a channel, why do we use a read lock here? With the read lock many threads can publish at the same time. We apparently want to use Lock here.

Contributor Author: I think you are right. The question, though, is whether we actually need a Lock at all.

In the case where two goroutines try to write to the same channel, I think Go is smart enough to block the second goroutine until the first one has finished writing. In that case we do not need any Lock. Is that correct?

WDYT?

Contributor: If we need the lock only to lock on the channel, then we can remove it.

Contributor Author: Yes, I think the Lock is not needed here; we can remove it.

Contributor: Pls remove it then.

Contributor Author: Done.
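Go's channel semantics back the conclusion above: sends on a single channel are already serialized by the runtime, so concurrent publishers need no extra mutex. A minimal standalone sketch (toy names, not from this repo):

```go
package main

import (
	"fmt"
	"sync"
)

func main() {
	ch := make(chan string) // unbuffered, like the subscriber channel above
	var wg sync.WaitGroup

	// Two goroutines send on the same channel with no mutex: the runtime
	// serializes the sends, and the second sender simply blocks until the
	// receiver has taken the first value.
	for _, name := range []string{"publisher-1", "publisher-2"} {
		wg.Add(1)
		go func(name string) {
			defer wg.Done()
			ch <- name // safe without any locking
		}(name)
	}

	go func() {
		wg.Wait()
		close(ch)
	}()

	for msg := range ch {
		fmt.Println("received from", msg)
	}
}
```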

 	defer e.publishL.RUnlock()
-	subscriber.Ch <- objectData
+	var err error
+	// We need the default case here because, if the subscriber is busy, we will not be able to send to the
+	// subscriber channel and the Publish function would get stuck. The default case serves exactly this purpose.
+	select {
+	case subscriber.Ch <- objectData:
+		log.Printf("Publish(): Notification is sent to subscriber %s\n", subscriber.Name)
+	default:
+		log.Printf("Publish(): Channel for subscriber %s is busy. Notification not sent", subscriber.Name)
+		err = fmt.Errorf("channel is busy")
+	}
+	return err
 }
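Note how the two changes in this file interact: with the previous buffered channel (`make(chan interface{}, 1)`) a select with a default branch would still "succeed" while the subscriber is not reading, because the value parks in the buffer, so a busy subscriber would only be detected one notification late. With an unbuffered channel the default branch fires exactly when no receiver is waiting, which is presumably why the buffer was dropped alongside this change. A small self-contained sketch of the difference:

```go
package main

import "fmt"

// trySend reports whether a non-blocking send on ch succeeded,
// mirroring the select/default in Publish above.
func trySend(ch chan int, v int) bool {
	select {
	case ch <- v:
		return true
	default:
		return false
	}
}

func main() {
	unbuffered := make(chan int)
	buffered := make(chan int, 1)

	// No receiver is running in either case.
	fmt.Println(trySend(unbuffered, 1)) // false: no receiver is waiting
	fmt.Println(trySend(buffered, 1))   // true: the value parks in the buffer
	fmt.Println(trySend(buffered, 2))   // false: the buffer is already full
}
```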

 // Unsubscribe the subscriber, which deletes the subscriber (all resources will be washed out)
66 changes: 49 additions & 17 deletions pkg/infradb/taskmanager/taskmanager.go
@@ -31,8 +31,9 @@
 	objectType      string
 	resourceVersion string
 	subIndex        int
-	retryTimer      time.Duration
-	subs            []*eventbus.Subscriber
+	// systemTimer is used only when we want to retry a Task due to unavailability of the Subscriber or not receiving a TaskStatus
+	systemTimer time.Duration
+	subs        []*eventbus.Subscriber
 }

 // TaskStatus holds info related to the status that has been received

@@ -60,6 +61,7 @@
 		objectType:      objectType,
 		resourceVersion: resourceVersion,
 		subIndex:        0,
+		systemTimer:     1 * time.Second,
 		subs:            subs,
 	}
 }
@@ -94,13 +96,18 @@
 // StatusUpdated creates a task status and sends it for handling
 func (t *TaskManager) StatusUpdated(name, objectType, resourceVersion, notificationID string, dropTask bool, component *common.Component) {
 	taskStatus := newTaskStatus(name, objectType, resourceVersion, notificationID, dropTask, component)
 
-	// Do we need to make this call happen in a goroutine in order to not make the
-	// StatusUpdated function stuck in case nobody reads what is written on the channel?
-	// Is there any case where this can happen
-	// (nobody reads what is written on the channel and StatusUpdated gets stuck)?
-	t.taskStatusChan <- taskStatus
-	log.Printf("StatusUpdated(): New Task Status has been created and sent to channel: %+v\n", taskStatus)
+	log.Printf("StatusUpdated(): New Task Status has been created: %+v\n", taskStatus)
+
+	// We need a default case here so the call does not get stuck when we send to the channel but nobody is reading from it.
+	// e.g. a subscriber got stuck and doesn't reply. The task will be requeued after the timer expires. In the meantime the
+	// subscriber replies and a taskStatus is sent to the channel, but the call would get stuck there, as the previous task has
+	// not been requeued yet because the timer has not expired and the queue is empty (we assume there is only one task in the queue).
+	select {
+	case t.taskStatusChan <- taskStatus:
+		log.Printf("StatusUpdated(): Task Status has been sent to channel: %+v\n", taskStatus)
+	default:
+		log.Printf("StatusUpdated(): Task Status has not been sent to channel. Channel not available: %+v\n", taskStatus)

Check warning on line 109 in pkg/infradb/taskmanager/taskmanager.go

View check run for this annotation

Codecov / codecov/patch

pkg/infradb/taskmanager/taskmanager.go#L99-L109

Added lines #L99 - L109 were not covered by tests
+	}
 }

// processTasks processes the task
@@ -123,7 +130,18 @@
 			// (e.g. Maybe you have a timeout on the subscribers and you got the notification after the timeout has passed)
 			NotificationID: uuid.NewString(),
 		}
-		eventbus.EBus.Publish(objectData, sub)
+		if err := eventbus.EBus.Publish(objectData, sub); err != nil {
+			log.Printf("processTasks(): Notification not sent to subscriber %+v with data %+v. Subscriber is busy. The Task %+v will be requeued.\n", sub, objectData, task)
+			// We keep this subIndex in order to know from which subscriber to start iterating after the requeue of the Task,
+			// so we start again from the subscriber that returned an error or was unavailable for any reason.
+			task.subIndex += i
Contributor: You are at the subscriber with index i at the moment. If you use += here and subIndex is already non-zero, won't you end up with a wrong index to start with next time?

Contributor Author: No, this will not happen, because the next time we take a sublist of the subscribers based on subIndex and iterate over that sublist, which means i starts from 0 again.

Check here: https://github.com/opiproject/opi-evpn-bridge/blob/main/pkg/infradb/taskmanager/taskmanager.go#L114

Contributor: Sounds a bit error-prone. What if, instead of index calculations, we copy the rest of the subscribers into the task, so it could continue where it stopped? Any other means?

Contributor Author: To be honest, I like the solution as it is today. It looks cleaner to me, it is tested, and it works well, so I would not like to change it if that is OK with you. I can add a comment so people can understand what this index calculation is all about. To me, keeping sublists of the remaining subscribers all the time is actually more error-prone. Maybe we can revisit this in the future, as this bug fix is not related to the subIndex; the subIndex was already there from the beginning.

Contributor (artek-koltun, Jul 22, 2024): For me it is not OK. I already misinterpreted what is happening here. We could create an issue and discuss it there. I don't know if we can add a Subscriber in the middle, whether we would need to regard that as a new one, and other details.

> I can add a comment so people can understand what this index calculation is all about.

Please do. I am also wondering whether it could live in a single place, like a "re-queue task" function.

> Maybe we can revisit this in the future, as this bug fix is not related to the subIndex; the subIndex was already there from the beginning.

Please create an issue.

Contributor Author: Done: #393
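To make the index arithmetic concrete: the requeued task is resumed by slicing the subscriber list at subIndex, so the loop variable i is relative to the resume point and += i keeps subIndex absolute. A self-contained sketch of the pattern (the subscriber names and the fails map are illustrative, not the repo's types):

```go
package main

import "fmt"

type subscriber struct{ name string }

type task struct {
	subIndex int
	subs     []*subscriber
}

// processOnce walks the subscribers starting at subIndex and stops at the
// first failure, recording where to resume. Returns true when all succeeded.
func processOnce(t *task, fails map[string]bool) bool {
	// Slicing at subIndex makes i relative to the resume point, so
	// t.subIndex += i recovers the absolute index of the failing subscriber.
	for i, sub := range t.subs[t.subIndex:] {
		if fails[sub.name] {
			t.subIndex += i
			return false
		}
	}
	return true
}

func main() {
	t := &task{subs: []*subscriber{{"frr"}, {"linux-general"}, {"linux-vendor"}}}

	// First attempt: the second subscriber is "busy".
	processOnce(t, map[string]bool{"linux-general": true})
	fmt.Println(t.subIndex) // 1: resume at linux-general

	// Retry after the timer: the same subscriber fails again.
	processOnce(t, map[string]bool{"linux-general": true})
	fmt.Println(t.subIndex) // still 1, not 2: += i with i == 0

	// Final retry succeeds for the rest.
	fmt.Println(processOnce(t, nil)) // true
}
```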

+			task.systemTimer *= 2
Contributor:

  • Maybe the name should be something like waitForRetry?
  • Will we stop increasing the timer if no one is listening for the published task on the other end?

Contributor Author (mardim91, Jul 17, 2024):

  • I would like to keep it as a system timer, as that is more explicit when you compare it with the Component.Timer, which is provided by each component in case of an error: time.AfterFunc(taskStatus.component.Timer, func() {
  • The timer increases every time we try to publish but the channel is busy and we need to requeue the task. It works this way:
  1. We publish.
  2. An error is returned because the channel is busy.
  3. We increase the timer.
  4. We wait in a goroutine for the timer to expire and then requeue the task.

Contributor (artek-koltun, Jul 19, 2024): The flow you described is clear. I want to know whether we ever stop increasing it. Will it make sense to wait for hours? Days? Years?

Contributor Author: Currently we never stop increasing it. This is a known issue that we just have not addressed so far; we are planning to address it in the future. I can open an issue to track it.

Contributor: Pls create an issue.

Contributor Author (mardim91, Aug 22, 2024): Done: #394
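Until #394 lands, one plausible shape for the fix is simply capping the doubling. A hedged sketch (maxSystemTimer and nextSystemTimer are illustrative assumptions, not part of this PR):

```go
package main

import (
	"fmt"
	"time"
)

// maxSystemTimer is an assumed upper bound; the PR itself doubles forever
// and defers the cap to issue #394.
const maxSystemTimer = 2 * time.Minute

// nextSystemTimer doubles the retry delay but never exceeds the cap.
func nextSystemTimer(cur time.Duration) time.Duration {
	next := cur * 2
	if next > maxSystemTimer {
		return maxSystemTimer
	}
	return next
}

func main() {
	d := 1 * time.Second // the initial value used by newTask above
	for i := 0; i < 10; i++ {
		d = nextSystemTimer(d)
		fmt.Println(d)
	}
	// Prints 2s, 4s, ..., 1m4s, then stays at 2m0s.
}
```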

+			log.Printf("processTasks(): The Task will be requeued after %+v\n", task.systemTimer)
+			time.AfterFunc(task.systemTimer, func() {
+				t.taskQueue.Enqueue(task)
+			})
+			break loopTwo

+		}
 		log.Printf("processTasks(): Notification has been sent to subscriber %+v with data %+v\n", sub, objectData)

 	loopThree:

@@ -143,11 +161,17 @@
 			log.Printf("processTasks(): received notification id %+v doesn't equal the sent notification id %+v\n", taskStatus.notificationID, objectData.NotificationID)
 
 		// We need a timeout in case the subscriber doesn't update the status at all for whatever reason.
-		// If that occurs then we just take a note of which subscriber to revisit and we requeue the task without any timer
+		// If that occurs then we just requeue the task with a timer
 		case <-time.After(30 * time.Second):
-			log.Printf("processTasks(): No task status has been received in the channel from subscriber %+v. The task %+v will be requeued immediately. Task Status %+v\n", sub, task, taskStatus)
+			log.Printf("processTasks(): No task status has been received in the channel from subscriber %+v. The task %+v will be requeued. Task Status %+v\n", sub, task, taskStatus)
+			// We keep this subIndex in order to know from which subscriber to start iterating after the requeue of the Task,
+			// so we start again from the subscriber that returned an error or was unavailable for any reason.
 			task.subIndex += i
-			go t.taskQueue.Enqueue(task)
+			task.systemTimer *= 2
+			log.Printf("processTasks(): The Task will be requeued after %+v\n", task.systemTimer)
+			time.AfterFunc(task.systemTimer, func() {
+				t.taskQueue.Enqueue(task)
Contributor:

  1. If we do not receive an answer within a reasonable time, does it mean that we have a problem and probably no one is handling our requests? And that the best we can do is report an error and gracefully shut down?
  2. If we block for 30 seconds in this thread, can we handle other task responses? If we are blocked for 30 seconds, doesn't it mean that what you actually want is to wait for the response in another goroutine, which could continue handling once the response is received?

Contributor Author:

  1. This is not built for that. What we want here is: when we do not receive any answer, because the subscriber is stuck for whatever reason, we requeue the task and try again, since the unresponsiveness of the subscriber may be temporary. If the subscriber is completely dead and cannot send a response back at all, we need to put some resiliency mechanisms in place to handle that case, but this is something to investigate and implement in the future; currently we do not look into that corner case.
  2. We will not have multiple subscribers sending multiple responses at the same time. The system is designed so that we send a task to a subscriber and then wait for that subscriber to respond before we move on to the next subscriber. There is no case where we send multiple tasks to multiple subscribers in parallel and the subscribers send multiple responses back in parallel.

Contributor:

> If the subscriber is completely dead and cannot send a response back at all, we need to put some resiliency mechanisms in place to handle that case, but this is something to investigate and implement in the future; currently we do not look into that corner case.

An issue is needed.

> The system is designed so that we send a task to a subscriber and then wait for that subscriber to respond before we move on to the next subscriber.

Sounds like a sequential flow. Do we need channels then? Pls consider; it would make everything much simpler.

Contributor Author: I think channels and notifications make the whole system scale better. The implementation so far uses just one task queue and one process that drains the queue. Maybe in the future, if we hit a limitation, we could use more processes to drain the task queue, and I think this will scale more easily when we are using channels and notifications. So I would prefer to keep it this way.

Contributor:

> Maybe in the future, if we hit a limitation, we could use more processes to drain the task queue, and I think this will scale more easily when we are using channels and notifications.

But we didn't hit one, and we don't know if we will really need that in the future, yet we are already complicating the design. If we need to scale, there are likely better approaches than this sequential one. If we don't need to scale, then we are complicating the system for nothing at the moment.

Contributor Author: Maybe I didn't get your question right at the beginning; I apologize for that.

The channels are not so much about the sequential flow as about keeping the core system as agnostic as possible to the plugins that we send notifications to and receive notifications from. The plugins are responsible for configuring the underlying system in their area of responsibility (e.g. FRRModule for FRR, LinuxModule for Linux, etc.). The core system doesn't need to know what those plugins are or what exactly they do; the only thing it needs to know is which subscribers to send notifications to. The channels support this agnostic notion, as well as the communication between the different goroutines, since each subscriber runs as a separate goroutine.

Also, the sequential flow is there because the different plugins have dependencies on each other. That dependency is expressed by the sequential flow, and that is why we want the first plugin to succeed before we move on to the second one. For instance, if the Linux Vendor Module wants to attach an interface to a bridge that was created beforehand by the General Linux Module, and the sequential flow is not there, the Linux Vendor Module call will fail, as it depends on the General Linux Module creating the bridge first.

This is a significant design choice which was presented to the TSC before the implementation, and we decided it was reasonable to move forward with it. We also think it is a good design that has worked well so far, and I do not really agree that the channels make the system complicated; they serve the architecture well.

Contributor (artek-koltun, Jul 26, 2024):

> The channels are not so much about the sequential flow as about keeping the core system as agnostic as possible to the plugins that we send notifications to and receive notifications from.

The same level of agnosticism can be achieved with a function call; the channels do not add more agnosticism here. By definition: channels are the pipes that connect concurrent goroutines. You can send values into channels from one goroutine and receive those values in another goroutine.

> ...as well as the communication between the different goroutines, since each subscriber runs as a separate goroutine.

They can run in different goroutines even if we invoke them through a function (you can use a mutex to sync, or send to a channel, or execute right in the same thread context).

> Also, the sequential flow is there because the different plugins have dependencies on each other. That dependency is expressed by the sequential flow, and that is why we want the first plugin to succeed before we move on to the second one.

We send a message and go to sleep waiting for a plugin to complete its job, turning our flow into a sequential one.

> For instance, if the Linux Vendor Module wants to attach an interface to a bridge that was created beforehand by the General Linux Module, and the sequential flow is not there, the Linux Vendor Module call will fail.

What if we do not register the General Linux Module at all, so it won't receive notifications at all?

> We also think it is a good design that has worked well so far, and I do not really agree that the channels make the system complicated; they serve the architecture well.

It might be. But from the chunk I see, it looks like you need a sequential flow, and channels add complexity.

Apparently this PR is not the place to make such decisions, but pls consider.

+			})
+			break loopThree
 		}
 	}
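For reference, the flow being debated reduces to: publish to one subscriber, then block on the status channel or a timeout before moving on. A condensed, self-contained sketch of that shape (the types, names, and startup sleep are simplifications, not the repo's actual API):

```go
package main

import (
	"fmt"
	"time"
)

type status struct{ ok bool }

// notifyAndWait mirrors the shape of the flow above: a non-blocking send to
// the subscriber (so a busy one is detected immediately), then a blocking
// wait for its status or a timeout — one subscriber at a time.
func notifyAndWait(work chan<- string, statusCh <-chan status, msg string, timeout time.Duration) error {
	select {
	case work <- msg:
	default:
		return fmt.Errorf("channel is busy") // same shape as Publish above
	}
	select {
	case st := <-statusCh:
		if !st.ok {
			return fmt.Errorf("subscriber reported an error")
		}
		return nil
	case <-time.After(timeout):
		return fmt.Errorf("no task status received")
	}
}

func main() {
	work := make(chan string)
	statusCh := make(chan status)

	// A toy subscriber: receives work, replies with a status.
	go func() {
		for range work {
			statusCh <- status{ok: true}
		}
	}()

	for _, step := range []string{"frr", "linux-general", "linux-vendor"} {
		// Crude pause so the toy subscriber is back at its receive;
		// good enough for a sketch, not production synchronization.
		time.Sleep(10 * time.Millisecond)
		fmt.Println(step, notifyAndWait(work, statusCh, step, time.Second))
	}
}
```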
@@ -159,19 +183,27 @@
 				break loopTwo
 			}
 
+			// We re-initialize the systemTimer every time we get a taskStatus. That means the subscriber is available and has responded.
+			task.systemTimer = 1 * time.Second

 			switch taskStatus.component.CompStatus {
 			case common.ComponentStatusSuccess:
 				log.Printf("processTasks(): Subscriber %+v has processed the task %+v successfully\n", sub, task)
 				continue loopTwo
-			default:
+			case common.ComponentStatusError:

Check warning on line 193 in pkg/infradb/taskmanager/taskmanager.go

View check run for this annotation

Codecov / codecov/patch

pkg/infradb/taskmanager/taskmanager.go#L193

Added line #L193 was not covered by tests
 				log.Printf("processTasks(): Subscriber %+v has not processed the task %+v successfully\n", sub, task)
+				log.Printf("processTasks(): The Task will be requeued after %+v\n", taskStatus.component.Timer)
+				// We keep this subIndex in order to know from which subscriber to start iterating after the requeue of the Task,
+				// so we start again from the subscriber that returned an error or was unavailable for any reason.
 				task.subIndex += i
-				task.retryTimer = taskStatus.component.Timer
-				log.Printf("processTasks(): The Task will be requeued after %+v\n", task.retryTimer)
-				time.AfterFunc(task.retryTimer, func() {
+				time.AfterFunc(taskStatus.component.Timer, func() {

Check warning on line 199 in pkg/infradb/taskmanager/taskmanager.go

View check run for this annotation

Codecov / codecov/patch

pkg/infradb/taskmanager/taskmanager.go#L199

Added line #L199 was not covered by tests
 					t.taskQueue.Enqueue(task)
 				})
 				break loopTwo
+			default:
+				log.Printf("processTasks(): Subscriber %+v has not provided a designated status for the task %+v\n", sub, task)
+				log.Printf("processTasks(): The task %+v will be dropped\n", task)
+				break loopTwo
 			}
 		}
 	}