Stationary environments
def
Learning.stationaryEnv
{α : Type u_1}
{R : Type u_2}
{mα : MeasurableSpace α}
{mR : MeasurableSpace R}
(ν : ProbabilityTheory.Kernel α R)
[ProbabilityTheory.IsMarkovKernel ν]
:
Environment α R
A stationary environment, in which the distribution of the next reward depends only on the last action.
Equations
- Learning.stationaryEnv ν = { feedback := fun (x : ℕ) => ProbabilityTheory.Kernel.prodMkLeft (↥(Finset.Iic x) → α × R) ν, h_feedback := ⋯, ν0 := ν, hp0 := inst✝ }
Instances For
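As a minimal usage sketch (not part of this file; it assumes the module defining these declarations is imported, with the import path omitted, and that the return type `Environment` is `Learning.Environment`): any Markov kernel from actions to rewards packages directly into a stationary environment.

open ProbabilityTheory in
example {α R : Type*} [MeasurableSpace α] [MeasurableSpace R]
    (ν : Kernel α R) [IsMarkovKernel ν] :
    Learning.Environment α R :=
  -- `stationaryEnv` plays `ν` at every round, independently of the history.
  Learning.stationaryEnv ν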
@[simp]
theorem
Learning.stationaryEnv_ν0
{α : Type u_1}
{R : Type u_2}
{mα : MeasurableSpace α}
{mR : MeasurableSpace R}
(ν : ProbabilityTheory.Kernel α R)
[ProbabilityTheory.IsMarkovKernel ν]
:
(stationaryEnv ν).ν0 = ν
@[simp]
theorem
Learning.stationaryEnv_feedback
{α : Type u_1}
{R : Type u_2}
{mα : MeasurableSpace α}
{mR : MeasurableSpace R}
(ν : ProbabilityTheory.Kernel α R)
[ProbabilityTheory.IsMarkovKernel ν]
(x✝ : ℕ)
:
(stationaryEnv ν).feedback x✝ = ProbabilityTheory.Kernel.prodMkLeft (↥(Finset.Iic x✝) → α × R) ν
theorem
Learning.IsAlgEnvSeq.condDistrib_reward_stationaryEnv
{α : Type u_1}
{R : Type u_2}
{mα : MeasurableSpace α}
{mR : MeasurableSpace R}
{Ω : Type u_3}
{mΩ : MeasurableSpace Ω}
[StandardBorelSpace α]
[Nonempty α]
[StandardBorelSpace R]
[Nonempty R]
{alg : Algorithm α R}
{ν : ProbabilityTheory.Kernel α R}
[ProbabilityTheory.IsMarkovKernel ν]
{P : MeasureTheory.Measure Ω}
[MeasureTheory.IsProbabilityMeasure P]
{A : ℕ → Ω → α}
{R' : ℕ → Ω → R}
(h : IsAlgEnvSeq A R' alg (stationaryEnv ν) P)
(n : ℕ)
:
⇑𝓛[R' n | A n; P] =ᵐ[MeasureTheory.Measure.map (A n) P] ⇑ν
The conditional distribution of the reward at time n given the action at time n is ν.
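Informally, writing $A_n$ and $R_n$ for the action and reward at time $n$ (a hedged restatement of the statement above, which is phrased via condDistrib):
$$\mathcal{L}(R_n \mid A_n = a) = \nu(a) \quad \text{for } (P \circ A_n^{-1})\text{-almost every } a,$$
i.e. in a stationary environment the law of the reward depends on the trajectory only through the current action.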
theorem
Learning.IsAlgEnvSeq.condIndepFun_reward_hist_action
{α : Type u_1}
{R : Type u_2}
{mα : MeasurableSpace α}
{mR : MeasurableSpace R}
{Ω : Type u_3}
{mΩ : MeasurableSpace Ω}
[StandardBorelSpace α]
[Nonempty α]
[StandardBorelSpace R]
[Nonempty R]
{alg : Algorithm α R}
{ν : ProbabilityTheory.Kernel α R}
[ProbabilityTheory.IsMarkovKernel ν]
{P : MeasureTheory.Measure Ω}
[MeasureTheory.IsProbabilityMeasure P]
{A : ℕ → Ω → α}
{R' : ℕ → Ω → R}
[StandardBorelSpace Ω]
(h : IsAlgEnvSeq A R' alg (stationaryEnv ν) P)
(n : ℕ)
:
ProbabilityTheory.CondIndepFun (MeasurableSpace.comap (A (n + 1)) inferInstance) ⋯ (R' (n + 1)) (hist A R' n) P
The reward at time n + 1 is conditionally independent of the history up to time n
given the action at time n + 1.
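In informal notation, with $H_n$ the history of actions and rewards up to time $n$ (a hedged restatement of the CondIndepFun statement):
$$R_{n+1} \perp\!\!\!\perp H_n \mid A_{n+1} \quad \text{under } P,$$
so once the action $A_{n+1}$ is known, the past gives no further information about the next reward.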
theorem
Learning.IsAlgEnvSeq.condIndepFun_reward_hist_action_action
{α : Type u_1}
{R : Type u_2}
{mα : MeasurableSpace α}
{mR : MeasurableSpace R}
{Ω : Type u_3}
{mΩ : MeasurableSpace Ω}
[StandardBorelSpace α]
[Nonempty α]
[StandardBorelSpace R]
[Nonempty R]
{alg : Algorithm α R}
{ν : ProbabilityTheory.Kernel α R}
[ProbabilityTheory.IsMarkovKernel ν]
{P : MeasureTheory.Measure Ω}
[MeasureTheory.IsProbabilityMeasure P]
{A : ℕ → Ω → α}
{R' : ℕ → Ω → R}
[StandardBorelSpace Ω]
(h : IsAlgEnvSeq A R' alg (stationaryEnv ν) P)
(n : ℕ)
:
ProbabilityTheory.CondIndepFun (MeasurableSpace.comap (A (n + 1)) inferInstance) ⋯ (R' (n + 1))
(fun (ω : Ω) => (hist A R' n ω, A (n + 1) ω)) P
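Informally (hedged): given $A_{n+1}$, the reward $R_{n+1}$ is conditionally independent of the pair $(H_n, A_{n+1})$,
$$R_{n+1} \perp\!\!\!\perp (H_n, A_{n+1}) \mid A_{n+1} \quad \text{under } P.$$
Since $A_{n+1}$ is measurable with respect to the conditioning σ-algebra, this is a repackaging of the previous lemma that is convenient when the history and the current action are handled as a single random variable.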
theorem
Learning.IsAlgEnvSeq.condIndepFun_reward_hist_action_action'
{α : Type u_1}
{R : Type u_2}
{mα : MeasurableSpace α}
{mR : MeasurableSpace R}
{Ω : Type u_3}
{mΩ : MeasurableSpace Ω}
[StandardBorelSpace α]
[Nonempty α]
[StandardBorelSpace R]
[Nonempty R]
{alg : Algorithm α R}
{ν : ProbabilityTheory.Kernel α R}
[ProbabilityTheory.IsMarkovKernel ν]
{P : MeasureTheory.Measure Ω}
[MeasureTheory.IsProbabilityMeasure P]
{A : ℕ → Ω → α}
{R' : ℕ → Ω → R}
[StandardBorelSpace Ω]
(h : IsAlgEnvSeq A R' alg (stationaryEnv ν) P)
(n : ℕ)
(hn : n ≠ 0)
:
ProbabilityTheory.CondIndepFun (MeasurableSpace.comap (A n) inferInstance) ⋯ (R' n)
(fun (ω : Ω) => (hist A R' (n - 1) ω, A n ω)) P
theorem
Learning.IT.condDistrib_reward_stationaryEnv
{α : Type u_1}
{R : Type u_2}
{mα : MeasurableSpace α}
{mR : MeasurableSpace R}
[StandardBorelSpace α]
[Nonempty α]
[StandardBorelSpace R]
[Nonempty R]
{alg : Algorithm α R}
{ν : ProbabilityTheory.Kernel α R}
[ProbabilityTheory.IsMarkovKernel ν]
(n : ℕ)
:
⇑𝓛[reward n | action n; trajMeasure alg (stationaryEnv ν)] =ᵐ[MeasureTheory.Measure.map (action n) (trajMeasure alg (stationaryEnv ν))] ⇑ν
The conditional distribution of the reward at time n given the action at time n is ν.
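A standard consequence, stated informally under $\mathbb{P} = $ trajMeasure alg (stationaryEnv ν) (hedged; the precise Lean form is not part of this file): for every bounded measurable $f$ on the reward space,
$$\mathbb{E}_{\mathbb{P}}\big[f(R_n) \mid A_n\big] = \int f(r) \, \nu(A_n)(\mathrm{d}r) \quad \mathbb{P}\text{-a.s.},$$
so expected rewards under the trajectory measure depend on the past only through the current action.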
theorem
Learning.IT.condIndepFun_reward_hist_action
{α : Type u_1}
{R : Type u_2}
{mα : MeasurableSpace α}
{mR : MeasurableSpace R}
[StandardBorelSpace α]
[Nonempty α]
[StandardBorelSpace R]
[Nonempty R]
{alg : Algorithm α R}
{ν : ProbabilityTheory.Kernel α R}
[ProbabilityTheory.IsMarkovKernel ν]
(n : ℕ)
:
ProbabilityTheory.CondIndepFun (MeasurableSpace.comap (action (n + 1)) inferInstance) ⋯ (reward (n + 1)) (hist n)
(trajMeasure alg (stationaryEnv ν))
The reward at time n + 1 is conditionally independent of the history up to time n
given the action at time n + 1.
theorem
Learning.IT.condIndepFun_reward_hist_action_action
{α : Type u_1}
{R : Type u_2}
{mα : MeasurableSpace α}
{mR : MeasurableSpace R}
[StandardBorelSpace α]
[Nonempty α]
[StandardBorelSpace R]
[Nonempty R]
{alg : Algorithm α R}
{ν : ProbabilityTheory.Kernel α R}
[ProbabilityTheory.IsMarkovKernel ν]
(n : ℕ)
:
ProbabilityTheory.CondIndepFun (MeasurableSpace.comap (action (n + 1)) inferInstance) ⋯ (reward (n + 1))
(fun (ω : ℕ → α × R) => (hist n ω, action (n + 1) ω)) (trajMeasure alg (stationaryEnv ν))
theorem
Learning.IT.condIndepFun_reward_hist_action_action'
{α : Type u_1}
{R : Type u_2}
{mα : MeasurableSpace α}
{mR : MeasurableSpace R}
[StandardBorelSpace α]
[Nonempty α]
[StandardBorelSpace R]
[Nonempty R]
{alg : Algorithm α R}
{ν : ProbabilityTheory.Kernel α R}
[ProbabilityTheory.IsMarkovKernel ν]
(n : ℕ)
(hn : n ≠ 0)
:
ProbabilityTheory.CondIndepFun (MeasurableSpace.comap (action n) inferInstance) ⋯ (reward n)
(fun (ω : ℕ → α × R) => (hist (n - 1) ω, action n ω)) (trajMeasure alg (stationaryEnv ν))