diff --git a/doc/kipc.adoc b/doc/kipc.adoc index 764973a7b8..7db2f3607a 100644 --- a/doc/kipc.adoc +++ b/doc/kipc.adoc @@ -290,6 +290,69 @@ returned by a call to `get_task_dump_region` for the specified task. A copy of the memory referred to by the specified region, starting at `base` and running for `size` bytes. +=== `find_faulted_task` (8) + +Scans forward from a given task index searching for a faulted task. If a faulted +task is found, returns its index. Otherwise, returns zero, the index of the +supervisor task, which is by definition not faulted. + +To simplify supervisor implementations, the given task index may equal the +number of tasks in the system. In this case, there are no tasks to search, and +you always get zero back. + +We do _not_ permit greater indices than that, because that would strongly +suggest a bug. + +==== Request + +[source,rust] +---- +struct FindFaultedTaskRequest { + starting_index: u32, +} +---- + +==== Preconditions + +The `starting_index` must be a valid index for this system, or one greater. + +==== Response + +[source,rust] +---- +struct FindFaultedTaskResponse { + fault_index: u32, +} +---- + +==== Notes + +This is intended for the common case of a supervisor scanning the task table in +search of a fault. Compared to doing this manually with `read_task_status`, +`find_faulted_task` has several advantages: + +1. Generates one kipc per failed task, plus one, rather than one kipc per + potentially failed task. + +2. Avoids serializing/transferring/deserializing the relatively complex + `TaskState` type, which supervisors rarely make use of in practice. + +3. Can be implemented in a way that doesn't potentially panic the supervisor. + +The "task ID or zero" return value is represented as an `Option` +in the Rust API, so a typical use of this kipc looks like: + +[source,rust] +---- +let mut next_task = 1; // skip supervisor +while let Some(fault) = kipc::find_faulted_task(next_task) { + let fault = usize::from(fault); + // do things with the faulted task + + next_task = fault + 1; +} +---- + == Receiving from the kernel The kernel never sends messages to tasks. It's simply not equipped to do so. diff --git a/sys/abi/src/lib.rs b/sys/abi/src/lib.rs index 4563b4b438..dd00782e43 100644 --- a/sys/abi/src/lib.rs +++ b/sys/abi/src/lib.rs @@ -490,6 +490,7 @@ pub enum Kipcnum { GetTaskDumpRegion = 6, ReadTaskDumpRegion = 7, SoftwareIrq = 8, + FindFaultedTask = 9, } impl core::convert::TryFrom for Kipcnum { @@ -505,6 +506,7 @@ impl core::convert::TryFrom for Kipcnum { 6 => Ok(Self::GetTaskDumpRegion), 7 => Ok(Self::ReadTaskDumpRegion), 8 => Ok(Self::SoftwareIrq), + 9 => Ok(Self::FindFaultedTask), _ => Err(()), } } diff --git a/sys/kern/src/kipc.rs b/sys/kern/src/kipc.rs index 5e6f6fc501..8becf24e37 100644 --- a/sys/kern/src/kipc.rs +++ b/sys/kern/src/kipc.rs @@ -39,6 +39,9 @@ pub fn handle_kernel_message( read_task_dump_region(tasks, caller, args.message?, args.response?) } Ok(Kipcnum::SoftwareIrq) => software_irq(tasks, caller, args.message?), + Ok(Kipcnum::FindFaultedTask) => { + find_faulted_task(tasks, caller, args.message?, args.response?) + } _ => { // Task has sent an unknown message to the kernel. That's bad. @@ -465,3 +468,37 @@ fn software_irq( tasks[caller].save_mut().set_send_response_and_length(0, 0); Ok(NextTask::Same) } + +fn find_faulted_task( + tasks: &mut [Task], + caller: usize, + message: USlice, + response: USlice, +) -> Result { + if caller != 0 { + return Err(UserError::Unrecoverable(FaultInfo::SyscallUsage( + UsageError::NotSupervisor, + ))); + } + + let index = deserialize_message::(&tasks[caller], message)? as usize; + + // Note: we explicitly permit index == tasks.len(), which causes us to wrap + // and end the search. + if index > tasks.len() { + return Err(UserError::Unrecoverable(FaultInfo::SyscallUsage( + UsageError::TaskOutOfRange, + ))); + } + let i = tasks[index..].iter().enumerate() + .find(|(_, task)| matches!(task.state(), TaskState::Faulted { .. })) + .map(|(i, _)| i) + .unwrap_or(0); + + let response_len = + serialize_response(&mut tasks[caller], response, &(i as u32))?; + tasks[caller] + .save_mut() + .set_send_response_and_length(0, response_len); + return Ok(NextTask::Same); +} diff --git a/sys/userlib/src/kipc.rs b/sys/userlib/src/kipc.rs index b153ca2913..aefea7c46d 100644 --- a/sys/userlib/src/kipc.rs +++ b/sys/userlib/src/kipc.rs @@ -4,6 +4,8 @@ //! Operations implemented by IPC with the kernel task. +use core::num::NonZeroUsize; + use abi::{Kipcnum, TaskId}; use zerocopy::AsBytes; @@ -24,6 +26,33 @@ pub fn read_task_status(task: usize) -> abi::TaskState { ssmarshal::deserialize(&response[..len]).unwrap_lite().0 } +/// Scans forward from index `task` looking for a task in faulted state. +/// +/// If no tasks at `task` or greater indices are faulted, this returns `None`. +/// +/// If a faulted task at index `i` is found, returns `Some(i)`. +/// +/// `task` may equal the number of tasks in the system (i.e. a one-past-the-end +/// index). In that case, this returns `None` every time. Larger values will get +/// you killed. +/// +/// The return value is a `NonZeroUsize` because this can't ever return zero, +/// since that would mean the supervisor (presumably the caller of this +/// function!) is in faulted state. +pub fn find_faulted_task(task: usize) -> Option { + // Coerce `task` to a known size (Rust doesn't assume that usize == u32) + let task = task as u32; + let mut response = 0_u32; + let (_, _) = sys_send( + TaskId::KERNEL, + Kipcnum::FindFaultedTask as u16, + task.as_bytes(), + response.as_bytes_mut(), + &[], + ); + NonZeroUsize::new(response as usize) +} + pub fn get_task_dump_region( task: usize, region: usize, diff --git a/task/jefe/src/main.rs b/task/jefe/src/main.rs index f52855b16b..2d48acd1de 100644 --- a/task/jefe/src/main.rs +++ b/task/jefe/src/main.rs @@ -307,37 +307,36 @@ impl idol_runtime::NotificationHandler for ServerImpl<'_> { // one task to have faulted since we last looked, but it's somewhat // unlikely since a fault causes us to immediately preempt. In any // case, let's assume we might have to handle multiple tasks. - // - // TODO: it would be fantastic to have a way of finding this out in - // one syscall. - for (i, status) in self.task_states.iter_mut().enumerate() { + let mut next_task = 1; + while let Some(fault_index) = kipc::find_faulted_task(next_task) { + let fault_index = usize::from(fault_index); + next_task = fault_index + 1; + + let status = &mut self.task_states[fault_index]; + // If we're aware that this task is in a fault state, don't // bother making a syscall to enquire. if status.holding_fault { continue; } - if let abi::TaskState::Faulted { .. } = - kipc::read_task_status(i) + #[cfg(feature = "dump")] { - #[cfg(feature = "dump")] - { - // We'll ignore the result of dumping; it could fail - // if we're out of space, but we don't have a way of - // dealing with that right now. - // - // TODO: some kind of circular buffer? - _ = dump::dump_task(self.dump_areas, i); - } - - if status.disposition == Disposition::Restart { - // Stand it back up - kipc::restart_task(i, true); - } else { - // Mark this one off so we don't revisit it until - // requested. - status.holding_fault = true; - } + // We'll ignore the result of dumping; it could fail + // if we're out of space, but we don't have a way of + // dealing with that right now. + // + // TODO: some kind of circular buffer? + _ = dump::dump_task(self.dump_areas, fault_index); + } + + if status.disposition == Disposition::Restart { + // Stand it back up + kipc::restart_task(fault_index, true); + } else { + // Mark this one off so we don't revisit it until + // requested. + status.holding_fault = true; } } }