Bandit
A stochastic, sequential algorithm.
- policy (n : ℕ) : ProbabilityTheory.Kernel ({ x : ℕ // x ∈ Finset.Iic n } → α × R) α
Policy or sampling rule: distribution of the next action, as a function of the history up to time n.
- h_policy (n : ℕ) : ProbabilityTheory.IsMarkovKernel (self.policy n)
- p0 : MeasureTheory.Measure α
Distribution of the first action.
- hp0 : MeasureTheory.IsProbabilityMeasure self.p0
Instances For
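Collecting the fields above, the algorithm amounts to a structure along the lines of the following sketch. The name Algorithm and the binder layout are assumptions; only the four fields and their types are taken from the listing above.

  import Mathlib

  open MeasureTheory ProbabilityTheory

  -- Sketch of the structure suggested by the fields listed above;
  -- the structure name `Algorithm` is an assumption.
  structure Algorithm (α R : Type*) [MeasurableSpace α] [MeasurableSpace R] where
    /-- Policy or sampling rule: distribution of the next action given the history. -/
    policy (n : ℕ) : Kernel ({ x : ℕ // x ∈ Finset.Iic n } → α × R) α
    h_policy (n : ℕ) : IsMarkovKernel (policy n)
    /-- Distribution of the first action. -/
    p0 : Measure α
    hp0 : IsProbabilityMeasure p0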
A stochastic environment.
- feedback (n : ℕ) : ProbabilityTheory.Kernel (({ x : ℕ // x ∈ Finset.Iic n } → α × R) × α) R
Distribution of the next observation as a function of the past history and the current action.
- h_feedback (n : ℕ) : ProbabilityTheory.IsMarkovKernel (self.feedback n)
- ν0 : ProbabilityTheory.Kernel α R
Distribution of the first observation given the first action.
- hp0 : ProbabilityTheory.IsMarkovKernel self.ν0
Instances For
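In the same sketch, the environment collects the four fields above. The structure name Environment is again an assumption; the type of feedback is the one forced by its use in the composition-product below.

  -- Sketch, in the same context as the Algorithm sketch above.
  structure Environment (α R : Type*) [MeasurableSpace α] [MeasurableSpace R] where
    /-- Distribution of the next observation given the past history and the current action. -/
    feedback (n : ℕ) : Kernel (({ x : ℕ // x ∈ Finset.Iic n } → α × R) × α) R
    h_feedback (n : ℕ) : IsMarkovKernel (feedback n)
    /-- Distribution of the first observation given the first action. -/
    ν0 : Kernel α R
    hp0 : IsMarkovKernel ν0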
Kernel describing the distribution of the next action-reward pair given the history up to n.
Equations
- Learning.stepKernel alg env n = (alg.policy n).compProd (env.feedback n)
Instances For
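Spelled out via Kernel.compProd_apply from Mathlib: the step kernel first samples an action from the policy and then a reward from the feedback kernel at that action. For a history h and a measurable set s of action-reward pairs,

  Learning.stepKernel alg env n h s
      = ∫⁻ a, env.feedback n (h, a) {r | (a, r) ∈ s} ∂(alg.policy n h)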
Kernel sending a partial trajectory of the bandit interaction Iic n → α × ℝ to a measure on ℕ → α × ℝ, supported on full trajectories that start with the partial one.
Equations
- Learning.traj alg env n = ProbabilityTheory.Kernel.traj (Learning.stepKernel alg env) n
Instances For
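Informally, this is the Ionescu-Tulcea construction underlying ProbabilityTheory.Kernel.traj: a trajectory ω drawn from Learning.traj alg env n applied to a partial trajectory h satisfies

  ω k = h k  for every k ≤ n,

and for m ≥ n the next pair ω (m + 1), conditionally on ω 0, …, ω m, has law Learning.stepKernel alg env m applied to the restriction of ω to Iic m. So the given prefix is kept and every later action-reward pair is generated by the step kernel from the history so far.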
Measure on the sequence of actions and observations generated by the algorithm/environment.
Equations
- Learning.trajMeasure alg env = ProbabilityTheory.Kernel.trajMeasure (alg.p0.compProd env.ν0) (Learning.stepKernel alg env)
Instances For
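Unfolding the definition, trajMeasure is the law of the interaction process (A n, R n) for n ∈ ℕ: the first action-reward pair is drawn from the composition product of the initial distributions, and each subsequent pair from the step kernel given the history,

  (A 0, R 0) ~ alg.p0.compProd env.ν0,
  (A (n+1), R (n+1)) ∣ (A 0, R 0), …, (A n, R n) ~ Learning.stepKernel alg env n.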
action n is the action pulled at time n. This is a random variable on the measurable space ℕ → α × ℝ.
Equations
- Learning.action n h = (h n).1
Instances For
reward n is the reward at time n. This is a random variable on the measurable space ℕ → α × R.
Equations
- Learning.reward n h = (h n).2
Instances For
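Both projections are measurable for the product σ-algebra on ℕ → α × R, which is what makes them random variables. A quick standalone check in the same context as the sketches above (illustration only, not the library's own lemmas):

  example {α R : Type*} [MeasurableSpace α] [MeasurableSpace R] (n : ℕ) :
      Measurable fun h : ℕ → α × R => (h n).1 :=
    (measurable_pi_apply n).fst

  example {α R : Type*} [MeasurableSpace α] [MeasurableSpace R] (n : ℕ) :
      Measurable fun h : ℕ → α × R => (h n).2 :=
    (measurable_pi_apply n).snd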
Filtration of the algorithm interaction.
Equations
Instances For
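The equation was not rendered here. Presumably (an assumption, not taken from the page) this is the natural filtration of the interaction: at time n, the σ-algebra on ℕ → α × R generated by the action-reward pairs observed so far,

  𝓕 n = σ(Learning.action k, Learning.reward k : k ≤ n).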
A deterministic algorithm.
Equations
- One or more equations did not get rendered due to their size.
Instances For
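The omitted equation presumably packages a measurable map from histories to actions as a point-mass policy. A minimal sketch on top of the Algorithm structure sketched earlier; the definition name, the argument names, and the Dirac first action are assumptions.

  noncomputable def deterministicAlgorithm {α R : Type*}
      [MeasurableSpace α] [MeasurableSpace R]
      (f : (n : ℕ) → ({ x : ℕ // x ∈ Finset.Iic n } → α × R) → α)
      (hf : ∀ n, Measurable (f n)) (a0 : α) : Algorithm α R where
    policy n := Kernel.deterministic (f n) (hf n)  -- point mass at the action chosen by f n
    h_policy n := by infer_instance
    p0 := Measure.dirac a0                          -- deterministic first action
    hp0 := by infer_instance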
A stationary environment, in which the distribution of the next reward depends only on the last action.
Equations
- Learning.stationaryEnv ν = { feedback := fun (x : ℕ) => ProbabilityTheory.Kernel.prodMkLeft ({ x_1 : ℕ // x_1 ∈ Finset.Iic x } → α × R) ν, h_feedback := ⋯, ν0 := ν, hp0 := inst✝ }
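Kernel.prodMkLeft simply discards the history component, so the feedback of a stationary environment at every time step is ν evaluated at the current action. A small standalone check in the same context as the sketches above (illustrative statement, only Mathlib names are used):

  example {α R γ : Type*} [MeasurableSpace α] [MeasurableSpace R] [MeasurableSpace γ]
      (ν : Kernel α R) (h : γ) (a : α) :
      Kernel.prodMkLeft γ ν (h, a) = ν a := rfl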