diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 975ccb9b..e75f5890 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -3,6 +3,7 @@ on:
   push:
     branches:
       - main
+      - gd/ar
     tags: ["*"]
   pull_request:
 concurrency:
diff --git a/examples/autodiff.jl b/examples/autodiff.jl
index 75905b17..e498ed19 100644
--- a/examples/autodiff.jl
+++ b/examples/autodiff.jl
@@ -29,7 +29,8 @@ rng = StableRNG(63);
 To play around with automatic differentiation, we define a simple controlled HMM.
 =#
 
-struct DiffusionHMM{V1<:AbstractVector,M2<:AbstractMatrix,V3<:AbstractVector} <: AbstractHMM
+struct DiffusionHMM{V1<:AbstractVector,M2<:AbstractMatrix,V3<:AbstractVector} <:
+       AbstractHMM{false}
     init::V1
     trans::M2
     means::V3
diff --git a/examples/controlled.jl b/examples/controlled.jl
index ffd0ab98..a0698632 100644
--- a/examples/controlled.jl
+++ b/examples/controlled.jl
@@ -22,10 +22,10 @@ rng = StableRNG(63);
 
 #=
 A Markov switching regression is like a classical regression, except that the weights depend on the unobserved state of an HMM.
-We can represent it with the following subtype of `AbstractHMM` (see [Custom HMM structures](@ref)), which has one vector of coefficients $\beta_i$ per state.
+We can represent it with the following subtype of `AbstractHMM{false}` (see [Custom HMM structures](@ref)), which has one vector of coefficients $\beta_i$ per state.
 =#
 
-struct ControlledGaussianHMM{T} <: AbstractHMM
+struct ControlledGaussianHMM{T} <: AbstractHMM{false}
     init::Vector{T}
     trans::Matrix{T}
     dist_coeffs::Vector{Vector{T}}
diff --git a/examples/interfaces.jl b/examples/interfaces.jl
index 4498478f..6bd7f311 100644
--- a/examples/interfaces.jl
+++ b/examples/interfaces.jl
@@ -138,14 +138,14 @@ test_allocations(rng, hmm, control_seq; seq_ends, hmm_guess)  #src
 #=
 In some scenarios, the vanilla Baum-Welch algorithm is not exactly what we want.
 For instance, we might have a prior on the parameters of our model, which we want to apply during the fitting step of the iterative procedure.
-Then we need to create a new type that satisfies the `AbstractHMM` interface.
+Then we need to create a new type that satisfies the `AbstractHMM{ar}` interface.
 
 Let's make a simpler version of the built-in `HMM`, with a prior saying that each transition has already been observed a certain number of times.
 Such a prior can be very useful to regularize estimation and avoid numerical instabilities.
 It amounts to drawing every row of the transition matrix from a Dirichlet distribution, where each Dirichlet parameter is one plus the number of times the corresponding transition has been observed.
 =#
 
-struct PriorHMM{T,D} <: AbstractHMM
+struct PriorHMM{T,D} <: AbstractHMM{false}
     init::Vector{T}
     trans::Matrix{T}
     dists::Vector{D}
@@ -153,7 +153,7 @@ struct PriorHMM{T,D} <: AbstractHMM
 end
 
 #=
-The basic requirements for `AbstractHMM` are the following three functions: [`initialization`](@ref), [`transition_matrix`](@ref) and [`obs_distributions`](@ref).
+The basic requirements for `AbstractHMM{false}` are the following three functions: [`initialization`](@ref), [`transition_matrix`](@ref) and [`obs_distributions`](@ref).
 =#
 
 HiddenMarkovModels.initialization(hmm::PriorHMM) = hmm.init
diff --git a/examples/temporal.jl b/examples/temporal.jl
index 1cad38f6..cf5a1d1f 100644
--- a/examples/temporal.jl
+++ b/examples/temporal.jl
@@ -23,10 +23,10 @@ rng = StableRNG(63);
 #=
 We focus on the particular case of a periodic HMM with period `L`.
 It has only one initialization vector, but `L` transition matrices and `L` vectors of observation distributions.
-As in [Custom HMM structures](@ref), we need to subtype `AbstractHMM`.
+As in [Custom HMM structures](@ref), we need to subtype `AbstractHMM{ar}`.
 =#
 
-struct PeriodicHMM{T<:Number,D,L} <: AbstractHMM
+struct PeriodicHMM{T<:Number,D,L} <: AbstractHMM{false}
     init::Vector{T}
     trans_per::NTuple{L,Matrix{T}}
     dists_per::NTuple{L,Vector{D}}
@@ -100,7 +100,7 @@ vcat(obs_seq', best_state_seq')
 # ## Learning
 
 #=
-When estimating parameters for a custom subtype of `AbstractHMM`, we have to override the fitting procedure after forward-backward, with an additional `control_seq` positional argument.
+When estimating parameters for a custom subtype of `AbstractHMM{false}`, we have to override the fitting procedure after forward-backward, with an additional `control_seq` positional argument.
 The key is to split the observations according to which periodic parameter they belong to.
 =#
 
diff --git a/src/HiddenMarkovModels.jl b/src/HiddenMarkovModels.jl
index aa319be6..46b24da8 100644
--- a/src/HiddenMarkovModels.jl
+++ b/src/HiddenMarkovModels.jl
@@ -16,7 +16,7 @@ using ChainRulesCore: ChainRulesCore, NoTangent, RuleConfig, rrule_via_ad
 using DensityInterface: DensityInterface, DensityKind, HasDensity, NoDensity, logdensityof
 using DocStringExtensions
 using FillArrays: Fill
-using LinearAlgebra: Transpose, axpy!, dot, ldiv!, lmul!, mul!, parent
+using LinearAlgebra: Transpose, axpy!, dot, ldiv!, lmul!, mul!, parent, diagm
 using Random: Random, AbstractRNG, default_rng
 using SparseArrays: AbstractSparseArray, SparseMatrixCSC, nonzeros, nnz, nzrange, rowvals
 using StatsAPI: StatsAPI, fit, fit!
diff --git a/src/inference/forward.jl b/src/inference/forward.jl
index 5e6e2518..59ea410d 100644
--- a/src/inference/forward.jl
+++ b/src/inference/forward.jl
@@ -69,10 +69,11 @@ function _forward_digest_observation!(
     hmm::AbstractHMM,
     obs,
     control,
+    prev_obs,
 )
     a, b = current_state_marginals, current_obs_likelihoods
 
-    obs_logdensities!(b, hmm, obs, control)
+    obs_logdensities!(b, hmm, obs, control, prev_obs)
     logm = maximum(b)
     b .= exp.(b .- logm)
 
@@ -104,7 +105,10 @@ function _forward!(
             αₜ₋₁ = view(α, :, t - 1)
             predict_next_state!(αₜ, hmm, αₜ₋₁, control_seq[t - 1])
         end
-        cₜ, logLₜ = _forward_digest_observation!(αₜ, Bₜ, hmm, obs_seq[t], control_seq[t])
+        prev_obs = t == t1 ? missing : previous_obs(hmm, obs_seq, t)
+        cₜ, logLₜ = _forward_digest_observation!(
+            αₜ, Bₜ, hmm, obs_seq[t], control_seq[t], prev_obs
+        )
         c[t] = cₜ
         logL[k] += logLₜ
     end
diff --git a/src/inference/forward_backward.jl b/src/inference/forward_backward.jl
index ba51ad46..973626cf 100644
--- a/src/inference/forward_backward.jl
+++ b/src/inference/forward_backward.jl
@@ -10,7 +10,7 @@ function initialize_forward_backward(
 )
     N, T, K = length(hmm), length(obs_seq), length(seq_ends)
     R = eltype(hmm, obs_seq[1], control_seq[1])
-    trans = transition_matrix(hmm, control_seq[1])
+    trans = transition_matrix(hmm, control_seq[2])
     M = typeof(similar(trans, R))
 
     γ = Matrix{R}(undef, N, T)
diff --git a/src/inference/logdensity.jl b/src/inference/logdensity.jl
index ce153ff2..6a73a024 100644
--- a/src/inference/logdensity.jl
+++ b/src/inference/logdensity.jl
@@ -37,9 +37,12 @@ function joint_logdensityof(
             trans = transition_matrix(hmm, control_seq[t])
             logL += log(trans[state_seq[t], state_seq[t + 1]])
         end
-        # Observations
-        for t in t1:t2
-            dists = obs_distributions(hmm, control_seq[t])
+        # Priori: P(Y_{1}|X_{1},U_{1})
+        dists = obs_distributions(hmm, control_seq[t1], missing)
+        logL += logdensityof(dists[state_seq[t1]], obs_seq[t1])
+        # Observations: P(Y_{t}|Y_{t-1},X_{t},U_{t})
+        for t in (t1 + 1):t2
+            dists = obs_distributions(hmm, control_seq[t], previous_obs(hmm, obs_seq, t))
             logL += logdensityof(dists[state_seq[t]], obs_seq[t])
         end
     end
diff --git a/src/inference/viterbi.jl b/src/inference/viterbi.jl
index f4f52701..58c9a887 100644
--- a/src/inference/viterbi.jl
+++ b/src/inference/viterbi.jl
@@ -48,13 +48,15 @@ function _viterbi!(
     t1, t2 = seq_limits(seq_ends, k)
 
     logBₜ₁ = view(logB, :, t1)
-    obs_logdensities!(logBₜ₁, hmm, obs_seq[t1], control_seq[t1])
+    obs_logdensities!(logBₜ₁, hmm, obs_seq[t1], control_seq[t1], missing)
     loginit = log_initialization(hmm)
     ϕ[:, t1] .= loginit .+ logBₜ₁
 
     for t in (t1 + 1):t2
         logBₜ = view(logB, :, t)
-        obs_logdensities!(logBₜ, hmm, obs_seq[t], control_seq[t])
+        obs_logdensities!(
+            logBₜ, hmm, obs_seq[t], control_seq[t], previous_obs(hmm, obs_seq, t)
+        )
         logtrans = log_transition_matrix(hmm, control_seq[t - 1])
         ϕₜ, ϕₜ₋₁ = view(ϕ, :, t), view(ϕ, :, t - 1)
         ψₜ = view(ψ, :, t)
diff --git a/src/types/abstract_hmm.jl b/src/types/abstract_hmm.jl
index 4fe1387c..7e56d206 100644
--- a/src/types/abstract_hmm.jl
+++ b/src/types/abstract_hmm.jl
@@ -23,7 +23,7 @@ Any `AbstractHMM` which satisfies the interface can be given to the following fu
 - [`forward_backward`](@ref)
 - [`baum_welch`](@ref) (if `[fit!](@ref)` is implemented)
 """
-abstract type AbstractHMM end
+abstract type AbstractHMM{ar} end
 
 @inline DensityInterface.DensityKind(::AbstractHMM) = HasDensity()
 
@@ -46,8 +46,8 @@ It is typically a promotion between the element type of the initialization, the
 function Base.eltype(hmm::AbstractHMM, obs, control)
     init_type = eltype(initialization(hmm))
     trans_type = eltype(transition_matrix(hmm, control))
-    dist = obs_distributions(hmm, control)[1]
-    logdensity_type = typeof(logdensityof(dist, obs))
+    dists = obs_distributions(hmm, control, obs)
+    logdensity_type = typeof(logdensityof(dists[1], obs))
     return promote_type(init_type, trans_type, logdensity_type)
 end
 
@@ -89,13 +89,13 @@ Falls back on `transition_matrix`.
 !!! note
     When processing sequences, the control at time `t` influences the transition from time `t` to `t+1` (and not from time `t-1` to `t`).
 """
-function log_transition_matrix(hmm::AbstractHMM, control)
-    return elementwise_log(transition_matrix(hmm, control))
-end
+log_transition_matrix(hmm::AbstractHMM, control) =
+    elementwise_log(transition_matrix(hmm, control))
 
 """
     obs_distributions(hmm)
     obs_distributions(hmm, control)
+    obs_distributions(hmm, control, obs)
 
 Return a vector of observation distributions, one for each state of `hmm` (possibly when `control` is applied).
 
@@ -109,9 +109,16 @@ function obs_distributions end
 
 ## Fallbacks for no control
 
+initialization(hmm::AbstractHMM, ::Nothing) = initialization(hmm)
 transition_matrix(hmm::AbstractHMM, ::Nothing) = transition_matrix(hmm)
 log_transition_matrix(hmm::AbstractHMM, ::Nothing) = log_transition_matrix(hmm)
 obs_distributions(hmm::AbstractHMM, ::Nothing) = obs_distributions(hmm)
+function obs_distributions(hmm::AbstractHMM, control, ::Any)
+    return obs_distributions(hmm, control)
+end
+
+previous_obs(::AbstractHMM{false}, obs_seq::AbstractVector, t::Integer) = nothing
+previous_obs(::AbstractHMM{true}, obs_seq::AbstractVector, t::Integer) = obs_seq[t - 1]
 
 """
     StatsAPI.fit!(
@@ -128,9 +135,9 @@ StatsAPI.fit!
 ## Fill logdensities
 
 function obs_logdensities!(
-    logb::AbstractVector{T}, hmm::AbstractHMM, obs, control
+    logb::AbstractVector{T}, hmm::AbstractHMM, obs, control, prev_obs
 ) where {T}
-    dists = obs_distributions(hmm, control)
+    dists = obs_distributions(hmm, control, prev_obs)
     @simd for i in eachindex(logb, dists)
         logb[i] = logdensityof(dists[i], obs)
     end
@@ -164,13 +171,13 @@ function Random.rand(rng::AbstractRNG, hmm::AbstractHMM, control_seq::AbstractVe
         )
     end
 
-    dists1 = obs_distributions(hmm, control_seq[1])
+    dists1 = obs_distributions(hmm, control_seq[1], missing)
     obs1 = rand(rng, dists1[state1])
     obs_seq = Vector{typeof(obs1)}(undef, T)
     obs_seq[1] = obs1
 
     for t in 2:T
-        dists = obs_distributions(hmm, control_seq[t])
+        dists = obs_distributions(hmm, control_seq[t], previous_obs(hmm, obs_seq, t))
         obs_seq[t] = rand(rng, dists[state_seq[t]])
     end
     return (; state_seq=state_seq, obs_seq=obs_seq)
diff --git a/src/types/hmm.jl b/src/types/hmm.jl
index 9c1f47b4..9ecbe1ad 100644
--- a/src/types/hmm.jl
+++ b/src/types/hmm.jl
@@ -13,7 +13,7 @@ struct HMM{
     VD<:AbstractVector,
     Vl<:AbstractVector,
     Ml<:AbstractMatrix,
-} <: AbstractHMM
+} <: AbstractHMM{false}
     "initial state probabilities"
     init::V
     "state transition probabilities"
diff --git a/test/autoregressive.jl b/test/autoregressive.jl
new file mode 100644
index 00000000..9b697a6c
--- /dev/null
+++ b/test/autoregressive.jl
@@ -0,0 +1,20 @@
+using Distributions
+using HiddenMarkovModels
+const HMMs = HiddenMarkovModels
+
+struct AutoRegressiveGaussianHMM{T} <: AbstractHMM{true}
+    init::Vector{T}
+    trans::Matrix{T}
+    a::Vector{T}
+    b::Vector{T}
+end
+
+const ARGHMM = AutoRegressiveGaussianHMM
+
+HMMs.initialization(hmm::ARGHMM) = hmm.init
+HMMs.transition_matrix(hmm::ARGHMM) = hmm.trans
+
+function HMMs.obs_distributions(hmm::ARGHMM, _control, prev_obs)
+    (; a, b) = hmm
+    return [Normal(a[i] * prev_obs + b[i], 1.0) for i in 1:length(hmm)]
+end
diff --git a/test/discretecontrolarhmm.jl b/test/discretecontrolarhmm.jl
new file mode 100644
index 00000000..1a5b5233
--- /dev/null
+++ b/test/discretecontrolarhmm.jl
@@ -0,0 +1,26 @@
+using Distributions
+using HiddenMarkovModels
+const HMMs = HiddenMarkovModels
+
+struct DiscreteCARHMM{T<:Number} <: AbstractHMM{true}
+    # Initial distribution P(X_{1}|U_{1}), one vector for each control
+    init::Vector{Vector{T}}
+    # Transition matrix P(X_{t}|X_{t-1}, U_{t}), one matrix for each control
+    trans::Vector{Matrix{T}}
+    # Emission matriz P(Y_{t}|X_{t}, U_{t}), one matriz for each control and each possible observation
+    dists::Vector{Vector{Matrix{T}}}
+    # Prior Distribution for P(Y_{1}|X_{1}, U_{1}), one matriz for each control
+    prior::Vector{Matrix{T}}
+end
+
+HMMs.initialization(hmm::DiscreteCARHMM, control) = hmm.init[control]
+
+HMMs.transition_matrix(hmm::DiscreteCARHMM, control) = hmm.trans[control]
+
+function HMMs.obs_distributions(hmm::DiscreteCARHMM, control, prev_obs)
+    return [Categorical(hmm.dists[control][prev_obs][i, :]) for i in 1:size(hmm, control)]
+end
+
+function HMMs.obs_distributions(hmm::DiscreteCARHMM, control, ::Missing)
+    return [Categorical(hmm.prior[control][i, :]) for i in 1:size(hmm, control)]
+end