Imports
import Mathlib

CLRS Section 16.3 - Huffman codes

This section gives a self-contained Lean proof of the optimality of Huffman codes. It is isolated from the legacy CfProofs.Greedy.Huffman.* modules and is arranged as a readable pipeline:

  1. Trees, frequencies, forests, and the Huffman merge algorithm.

  2. Local tree-editing lemmas for swaps, merges, and split leaves.

  3. Preservation lemmas for huffman over a forest.

  4. The exchange/split-leaf theorem.

  5. The bundled-forest V2 optimality theorem and frequency-table interface.

open Listnamespace CLRSnamespace HuffmanV2

Trees, forests, and the Huffman algorithm

inductive HuffTree : Type | htLeaf (symbol freq : ) | htInner (left right : HuffTree) deriving Repr, DecidableEqopen HuffTreedef rootFreq : HuffTree | htLeaf _ f => f | htInner l r => rootFreq l + rootFreq rdef alphabet : HuffTree Finset | htLeaf s _ => {s} | htInner l r => alphabet l alphabet rdef consistent : HuffTree Prop | htLeaf _ _ => True | htInner l r => consistent l consistent r Disjoint (alphabet l) (alphabet r)def height : HuffTree | htLeaf _ _ => 0 | htInner l r => max (height l) (height r) + 1def depthOf (s : ) : HuffTree Option | htLeaf sym _ => if sym = s then some 0 else none | htInner l r => match depthOf s l with | some d => some (d + 1) | none => match depthOf s r with | some d => some (d + 1) | none => nonedef freqOf (s : ) : HuffTree | htLeaf sym f => if sym = s then f else 0 | htInner l r => freqOf s l + freqOf s rdef cost : HuffTree | htLeaf _ _ => 0 | htInner l r => cost l + cost r + rootFreq l + rootFreq rdef nodeCount : HuffTree | htLeaf _ _ => 1 | htInner l r => 1 + nodeCount l + nodeCount rlemma height_eq_zero_iff (t : HuffTree) : height t = 0 s f, t = htLeaf s f := ; cases t with ; def sameFreqs (t u : HuffTree) : Prop := s, freqOf s t = freqOf s udef optimum (t : HuffTree) : Prop := consistent t ( s alphabet t, freqOf s t > 0) u, consistent u sameFreqs t u cost t cost udef unite (t₁ t₂ : HuffTree) : HuffTree := htInner t₁ t₂def insortTree (t : HuffTree) : List HuffTree List HuffTree | [] => [t] | u :: us => if rootFreq t rootFreq u then t :: u :: us else u :: insortTree t us def forest_consistent : List HuffTree Prop | [] => True | [t] => consistent t | t :: ts => consistent t forest_consistent ts ( u ts, Disjoint (alphabet t) (alphabet u))def forest_sorted : List HuffTree Prop | [] => True | [_] => True | t₁ :: t₂ :: ts => rootFreq t₁ rootFreq t₂ forest_sorted (t₂ :: ts)def replaceFreq (sym newFreq : ) : HuffTree HuffTree | htLeaf s f => if s = sym then htLeaf s newFreq else htLeaf s f | htInner l r => htInner (replaceFreq sym newFreq l) (replaceFreq sym newFreq r)def swapFreqs (a c : ) (t : HuffTree) : HuffTree := replaceFreq c (freqOf a t) (replaceFreq a (freqOf c t) t)def splitLeaf (t : HuffTree) (z a b fa fb : ) : HuffTree := match t with | htLeaf sym f => if sym = z then htInner (htLeaf a fa) (htLeaf b fb) else htLeaf sym f | htInner l r => htInner (splitLeaf l z a b fa fb) (splitLeaf r z a b fa fb)def swapLeaves (a b : ) : HuffTree HuffTree | htLeaf s f => if s = a then htLeaf b f else if s = b then htLeaf a f else htLeaf s f | htInner l r => htInner (swapLeaves a b l) (swapLeaves a b r)def mergePair (a b z fz : ) : HuffTree HuffTree | htInner (htLeaf x fx) (htLeaf y fy) => if (x = a y = b) (x = b y = a) then htLeaf z fz else htInner (htLeaf x fx) (htLeaf y fy) | htInner l r => htInner (mergePair a b z fz l) (mergePair a b z fz r) | t => t

Alphabet, frequency, and depth lemmas

theorem mem_alphabet_of_freq_pos (s : ) (t : HuffTree) (h : freqOf s t > 0) : s alphabet t := ; theorem alphabet_replaceFreq (sym newFreq : ) (t : HuffTree) : alphabet (replaceFreq sym newFreq t) = alphabet t := induction t with ; theorem freqOf_replaceFreq_of_ne (sym newFreq other : ) (t : HuffTree) (h_ne : other sym) : freqOf other (replaceFreq sym newFreq t) = freqOf other t := induction t with ; ; ; theorem cost_replaceFreq_eq (sym newFreq : ) (t : HuffTree) (h_cons : consistent t) : (cost (replaceFreq sym newFreq t) : ) = (cost t : ) + ((newFreq : ) - (freqOf sym t : )) * ((depthOf sym t).getD 0 : ) := induction t with ; ; ;

splitLeaf infrastructure

;

Basic swap invariants

;

Frequency and depth behavior under swaps

; ; ; ; ; ; lemma depthOf_replaceFreq_eq (sym newFreq s : ) (t : HuffTree) : depthOf s (replaceFreq sym newFreq t) = depthOf s t := induction t with ;

Consistency and cost behavior under exchanges

def swapSym (a b s : ) : := if s = a then b else if s = b then a else s _ = b := h2 lemma alphabet_swapLeaves_eq_image (a b : ) (t : HuffTree) : alphabet (swapLeaves a b t) = (alphabet t).image (swapSym a b) := induction t with ; ;

Merging sibling leaves

inductive areSiblings (a b : ) : HuffTree Prop | here (fa fb : ) : areSiblings a b (htInner (htLeaf a fa) (htLeaf b fb)) | here' (fa fb : ) : areSiblings a b (htInner (htLeaf b fb) (htLeaf a fa)) | inLeft (l r : HuffTree) : areSiblings a b l areSiblings a b (htInner l r) | inRight (l r : HuffTree) : areSiblings a b r areSiblings a b (htInner l r) ; lemma areSiblings_mem_alphabet {a b : } {t : HuffTree} (h_sib : areSiblings a b t) : a alphabet t b alphabet t := induction h_sib with ; cases r with ; cases r with ; ; ; ; ; ; ; cases r with ; ; ; ; ; ; ; ; ; ; ; ; ; ;

Commuting split leaves through Huffman merging

The commutation proof only needs rootFreq (splitLeaf t) = rootFreq t for each forest tree.

lemma insortTree_length (t : HuffTree) (ts : List HuffTree) : (insortTree t ts).length = ts.length + 1 := induction ts with ; ; @[simp] lemma splitLeaf_unite (l r : HuffTree) (z a b fa fb : ) : splitLeaf (unite l r) z a b fa fb = unite (splitLeaf l z a b fa fb) (splitLeaf r z a b fa fb) := @[simp] lemma rootFreq_unite (t1 t2 : HuffTree) : rootFreq (unite t1 t2) = rootFreq t1 + rootFreq t2 :=

insortTree commutation with map splitLeaf

insortTree membership

lemma mem_insortTree (t : HuffTree) (ts : List HuffTree) (u : HuffTree) : u insortTree t ts u = t u ts := induction ts generalizing t with lemma forall_mem_insortTree {P : HuffTree Prop} {U : HuffTree} {ts : List HuffTree} (hU : P U) (hts : t ts, P t) : t insortTree U ts, P t :=

General commutation theorem

Special case for optimum_huffman

Given a forest rest where s1 appears only as htLeaf s1 (f1+f2) (the combined leaf), we have the commutation:

splitLeaf (huffman (insortTree (htLeaf s1 (f1+f2)) rest)) s1 s1 s2 f1 f2 = huffman (insortTree (htInner (htLeaf s1 f1) (htLeaf s2 f2)) rest)

Requires: s1 ∉ alphabet t for all t ∈ rest, so that splitLeaf does nothing on rest.

Preservation of forest frequencies and alphabets

huffman preserves the aggregate frequencies and alphabet of a nonempty forest.

def forest_freq (ts : List HuffTree) (s : ) : := (ts.map (freqOf s)).sumdef forest_alphabet : List HuffTree Finset | [] => | t :: ts => alphabet t forest_alphabet tslemma mem_forest_alphabet (ts : List HuffTree) (s : ) : s forest_alphabet ts t ts, s alphabet t := induction ts with lemma forest_freq_cons (t : HuffTree) (ts : List HuffTree) (s : ) : forest_freq (t :: ts) s = freqOf s t + forest_freq ts s := lemma forest_freq_insortTree (t : HuffTree) (ts : List HuffTree) (s : ) : forest_freq (insortTree t ts) s = forest_freq (t :: ts) s := induction ts generalizing t with lemma forest_alphabet_insortTree (t : HuffTree) (ts : List HuffTree) : forest_alphabet (insortTree t ts) = forest_alphabet (t :: ts) := induction ts generalizing t with ; lemma freqOf_huffman_eq_forest_freq (ts : List HuffTree) (s : ) (h_nonempty : ts []) : freqOf s (huffman ts) = forest_freq ts s := induction ts using huffman.induct with lemma alphabet_huffman_eq_forest_alphabet (ts : List HuffTree) (h_nonempty : ts []) : alphabet (huffman ts) = forest_alphabet ts := induction ts using huffman.induct with lemma forest_consistent_cons_iff (t : HuffTree) (ts : List HuffTree) : forest_consistent (t :: ts) consistent t forest_consistent ts u ts, Disjoint (alphabet t) (alphabet u) := cases ts with ; ; lemma forest_freq_eq_zero_of_not_mem (ts : List HuffTree) (s : ) (h : s forest_alphabet ts) : forest_freq ts s = 0 := lemma forest_sorted_tail (t : HuffTree) (ts : List HuffTree) (h_sorted : forest_sorted (t :: ts)) : forest_sorted ts := cases ts with cases h_r : insortTree t us with exact Disjoint.symm (h_parts.2.2 u ( ))

Core exchange and split-leaf optimality theorem

def deepestSiblingPair (t : HuffTree) : × := match t with | htLeaf s _ => (s, s) | htInner l r => match l, r with | htLeaf x _, htLeaf y _ => (x, y) | htLeaf _ _, _ => deepestSiblingPair r | _, htLeaf _ _ => deepestSiblingPair l | _, _ => if height l height r then deepestSiblingPair l else deepestSiblingPair r; ; lemma depthOf_getD_inner_of_mem_left {s : } {l r : HuffTree} (h : s alphabet l) : (depthOf s (htInner l r)).getD 0 = (depthOf s l).getD 0 + 1 := lemma depthOf_getD_inner_of_mem_right {s : } {l r : HuffTree} (h : s alphabet r) (h_not : s alphabet l) : (depthOf s (htInner l r)).getD 0 = (depthOf s r).getD 0 + 1 := lemma areSiblings_exchangeLeft (t : HuffTree) (a x y : ) (h_sib : areSiblings x y t) (h_ne_ax : a x) (h_ne_ay : a y) (h_ne_xy : x y) : areSiblings a y (swapLeaves a x t) := induction h_sib with <;> simp <;> simp lemma areSiblings_replaceFreq (t : HuffTree) (a b sym freq : ) (h_sib : areSiblings a b t) : areSiblings a b (replaceFreq sym freq t) := induction h_sib with ; ; ; ; lemma areSiblings_swapFreqs_preserved (t : HuffTree) (a b x y : ) (h_sib : areSiblings a b t) : areSiblings a b (swapFreqs x y t) := lemma areSiblings_ne (t : HuffTree) (a b : ) (h_cons : consistent t) (h_sib : areSiblings a b t) : a b := induction h_sib with ; ; lemma optimum_leaf (s f : ) (h_f_pos : f > 0) : optimum (htLeaf s f) := refine , ?_, ?_ ; private lemma areSiblings_swap_siblings {a b : } {t : HuffTree} (h_sib : areSiblings a b t) (h_ne : a b) : areSiblings b a (swapLeaves a b t) := induction h_sib with ; ; lemma areSiblings_exchangeRight (t : HuffTree) (a b x : ) (h_sib : areSiblings a x t) (h_ne_ba : b a) (h_ne_bx : b x) (h_ne_ax : a x) : areSiblings a b (swapLeaves b x t) := induction h_sib with <;> simp <;> simp ; ; cases r with ; private lemma mem_alphabet_splitLeaf_of_ne (t : HuffTree) (z b fa fb x : ) (hx_ne_z : x z) (hx_ne_b : x b) : x alphabet (splitLeaf t z z b fa fb) x alphabet t := induction t with ; private lemma freqOf_splitLeaf_of_ne (t : HuffTree) (z b fa fb x : ) (hx_ne_z : x z) (hx_ne_b : x b) : freqOf x (splitLeaf t z z b fa fb) = freqOf x t := induction t with ; apply ihr hcr hz_r ( ; ; ) hz_ne_b apply ihr hcr hz_r ( ; ; ) hz_ne_b structure SplitFreqCandidate (t base tree : HuffTree) (z b fa fb : ) where cons : consistent tree freq_rel : s, freqOf s tree = freqOf s (splitLeaf t z z b fa fb) cost_le : (cost tree : ) (cost base : )namespace SplitFreqCandidatetheorem ofBase {t base : HuffTree} {z b fa fb : } (h_cons : consistent base) (h_freq_rel : s, freqOf s base = freqOf s (splitLeaf t z z b fa fb)) : SplitFreqCandidate t base base z b fa fb := { cons := h_cons freq_rel := h_freq_rel cost_le := le_refl _ } cost_le := h_cost_le } end SplitFreqCandidatestructure SplitMergeCandidate (t base : HuffTree) (z b fa fb : ) where tree : HuffTree freq : SplitFreqCandidate t base tree z b fa fb sibling : areSiblings z b tree exact_mod_cast (show (cost t : ) + (fa : ) + (fb : ) (cost v : ) from )namespace SplitMergeCandidatedef ofBase {t base : HuffTree} {z b fa fb : } (h_sib : areSiblings z b base) (h_cons : consistent base) (h_freq_rel : s, freqOf s base = freqOf s (splitLeaf t z z b fa fb)) : SplitMergeCandidate t base z b fa fb where tree := base freq := SplitFreqCandidate.ofBase h_cons h_freq_rel sibling := h_sibdef ofExchange {t base w : HuffTree} {z b fa fb a x : } (h_ne : a x) (ha_in : a alphabet w) (hx_in : x alphabet w) (h_cons_w : consistent w) (h_freq_rel_w : s, freqOf s w = freqOf s (splitLeaf t z z b fa fb)) (h_sib : areSiblings z b (swapFreqs a x (swapLeaves a x w))) (h_cost_le : (cost (swapFreqs a x (swapLeaves a x w)) : ) (cost base : )) : SplitMergeCandidate t base z b fa fb where tree := swapFreqs a x (swapLeaves a x w) freq := SplitFreqCandidate.ofExchange h_ne ha_in hx_in h_cons_w h_freq_rel_w h_cost_le sibling := h_sibdef ofFreqCandidate {t base tree : HuffTree} {z b fa fb : } (C : SplitFreqCandidate t base tree z b fa fb) (h_sib : areSiblings z b tree) : SplitMergeCandidate t base z b fa fb where tree := tree freq := C sibling := h_sibtheorem cost_bound {t base : HuffTree} {z b fa fb : } (C : SplitMergeCandidate t base z b fa fb) (h_opt_t : u, consistent u sameFreqs t u cost t cost u) (h_cons_t : consistent t) (h_z_in : z alphabet t) (hb_not_mem : b alphabet t) (hz_ne_b : z b) (hb_fb_t : freqOf b t = 0) (h_sum : freqOf z t = fa + fb) : cost t + fa + fb cost base := merge_split_siblings_cost_bound t base C.tree z b fa fb h_opt_t hb_fb_t hz_ne_b h_sum C.sibling C.freq.cons (C.freq.freq_left h_cons_t h_z_in hb_not_mem hz_ne_b) (C.freq.freq_right h_cons_t h_z_in hb_not_mem hz_ne_b) C.freq.freq_rel C.freq.cost_leend SplitMergeCandidatenamespace SplitFreqCandidatedef exchangeToMerge {t base tree : HuffTree} {z b fa fb a x : } (C : SplitFreqCandidate t base tree z b fa fb) (h_ne : a x) (ha_in : a alphabet tree) (hx_in : x alphabet tree) (h_sib : areSiblings z b (swapFreqs a x (swapLeaves a x tree))) (h_cost_step : (cost (swapFreqs a x (swapLeaves a x tree)) : ) (cost tree : )) : SplitMergeCandidate t base z b fa fb := SplitMergeCandidate.ofFreqCandidate (SplitFreqCandidate.ofExchange (t := t) (base := base) (w := tree) (z := z) (b := b) (fa := fa) (fb := fb) (a := a) (x := x) h_ne ha_in hx_in C.cons C.freq_rel (le_trans h_cost_step C.cost_le)) h_sibend SplitFreqCandidate

Split-leaf optimality interface

The long exchange proof above is intentionally hidden behind the following small certificate. The rest of the Huffman proof only needs to know that a merged symbol z can be split into two positive minimum-frequency symbols z and b, with the old frequency of z equal to their sum.

structure SplitLeafOptimalitySpec (t : HuffTree) (z b fa fb : ) where z_mem : z alphabet t b_fresh : b alphabet t sym_ne : z b fa_pos : fa > 0 fb_pos : fb > 0 fa_le_fb : fa fb fa_min : s alphabet t, fa freqOf s t fb_min : s alphabet t, s z fb freqOf s t freq_z : freqOf z t = fa + fbtheorem split_leaf_preserves_optimum {t : HuffTree} {z b fa fb : } (S : SplitLeafOptimalitySpec t z b fa fb) (h_opt : optimum t) : optimum (splitLeaf t z z b fa fb) := optimum_splitLeaf t z b fa fb h_opt S.z_mem S.b_fresh S.sym_ne S.fa_pos S.fb_pos S.fa_le_fb S.fa_min S.fb_min S.freq_z

Bundled forest invariant and greedy-step certificate

structure Forest where trees : List HuffTree sorted : forest_sorted trees consistent : forest_consistent trees allLeaves : t trees, height t = 0 allPos : t trees, rootFreq t > 0 nonempty : trees []namespace Forestdef length (F : Forest) : := F.trees.length

Greedy step on a raw list that already satisfies the forest invariant.

exact forall_mem_insortTree (P := λ t => rootFreq t > 0) h_pos_new (fun t ht => h_pos t ( )), nonempty := insortTree_ne_nil tNew rest }

The bundled greedy reduction step.

def mergeCheapest (F : Forest) (h : F.length 2) : Forest := mergeCheapestList F.trees F.sorted F.consistent F.allLeaves F.allPos ( )

Specification of mergeCheapest

lemma mergeCheapestList_spec (ts : List HuffTree) (h_sorted : forest_sorted ts) (h_cons : forest_consistent ts) (h_leaves : t ts, height t = 0) (h_pos : t ts, rootFreq t > 0) (h : ts.length 2) : sa fa sb fb rest, ts = htLeaf sa fa :: htLeaf sb fb :: rest (mergeCheapestList ts h_sorted h_cons h_leaves h_pos h).trees = insortTree (htLeaf sa (fa + fb)) rest := cases ts with cases ts1 with rcases height_eq_zero_iff t1 |>.mp (h_leaves t1 ( )) with sa, fa, h1' rcases height_eq_zero_iff t2 |>.mp (h_leaves t2 ( )) with sb, fb, h2' lemma mergeCheapest_length_lt (F : Forest) (h : F.length 2) : (F.mergeCheapest h).length < F.length :=

Named certificate for the facts produced by one bundled greedy step. It packages the exact hypotheses needed by optimum_splitLeaf plus the final commutation equality back to the original forest.

structure MergeCheapestSplitReady (F : Forest) (h : F.length 2) where sa : fa : sb : fb : fa_pos : fa > 0 fb_pos : fb > 0 sym_ne : sa sb sa_mem : sa alphabet (huffman (F.mergeCheapest h).trees) sb_not_mem : sb alphabet (huffman (F.mergeCheapest h).trees) freq_sa : freqOf sa (huffman (F.mergeCheapest h).trees) = fa + fb fa_le_fb : fa fb min_fa : s alphabet (huffman (F.mergeCheapest h).trees), fa freqOf s (huffman (F.mergeCheapest h).trees) min_fb : s alphabet (huffman (F.mergeCheapest h).trees), s sa fb freqOf s (huffman (F.mergeCheapest h).trees) split_commute : splitLeaf (huffman (F.mergeCheapest h).trees) sa sa sb fa fb = huffman F.trees

The bundled greedy step exposes exactly the side conditions needed to split the merged leaf after the recursive Huffman call. This is the main V2 interface: callers do not inspect the forest plumbing directly.

namespace MergeCheapestSplitReadytheorem splitLeafSpec {F : Forest} {h : F.length 2} (R : MergeCheapestSplitReady F h) : SplitLeafOptimalitySpec (huffman (F.mergeCheapest h).trees) R.sa R.sb R.fa R.fb := { z_mem := R.sa_mem b_fresh := R.sb_not_mem sym_ne := R.sym_ne fa_pos := R.fa_pos fb_pos := R.fb_pos fa_le_fb := R.fa_le_fb fa_min := R.min_fa fb_min := R.min_fb freq_z := R.freq_sa } end MergeCheapestSplitReadytheorem mergeCheapest_preserves_optimum (F : Forest) (h : F.length 2) (h_opt_reduced : optimum (huffman (F.mergeCheapest h).trees)) : optimum (huffman F.trees) := end Forestnamespace OptimalV2

Huffman's algorithm produces an optimum tree for every valid bundled forest.

)
end OptimalV2

Frequency-table interface

The frequency assigned to a symbol by a raw frequency table.

def tableFreq (xs : List ( × )) (s : ) : := (xs.map (fun p => if p.1 = s then p.2 else 0)).sum

Turn a frequency table into singleton Huffman leaves.

def leavesOfFreqs (xs : List ( × )) : List HuffTree := xs.map (fun p => htLeaf p.1 p.2)

Insertion-sort a forest by rootFreq, using the same order expected by huffman.

def sortForest : List HuffTree List HuffTree | [] => [] | t :: ts => insortTree t (sortForest ts)

Run Huffman's algorithm on an arbitrary frequency table.

def huffmanOfFreqs (xs : List ( × )) : HuffTree := huffman (sortForest (leavesOfFreqs xs))
lemma sortForest_sorted (ts : List HuffTree) : forest_sorted (sortForest ts) := induction ts with lemma sortForest_ne_nil {ts : List HuffTree} (h_nonempty : ts []) : sortForest ts [] := cases ts with lemma mem_sortForest (t : HuffTree) (ts : List HuffTree) : t sortForest ts t ts := induction ts with ); lemma forest_freq_leavesOfFreqs (xs : List ( × )) (s : ) : forest_freq (leavesOfFreqs xs) s = tableFreq xs s := induction xs with refine , ih h_tail_nodup, ?_ exact sortForest_ne_nil ( )

Huffman's algorithm is optimal for every nonempty positive frequency table with no duplicate symbols. The table may be in any order; it is sorted before calling the bundled V2 huffman optimality theorem.

theorem optimum_huffman_freqs (xs : List ( × )) (h_nodup : (xs.map Prod.fst).Nodup) (h_pos : p xs, p.2 > 0) (h_nonempty : xs []) : optimum (huffmanOfFreqs xs) := exact OptimalV2.optimum_huffman_v2 { trees := sortForest (leavesOfFreqs xs) sorted := sortForest_sorted (leavesOfFreqs xs) consistent := forest_consistent_sortForest (leavesOfFreqs xs) (forest_consistent_leavesOfFreqs xs h_nodup) allLeaves := fun t ht => rcases ( ) with a, b, _, rfl allPos := fun t ht => rcases ( ) with a, b, hp, rfl nonempty := sortForest_ne_nil ( ) }

Reader-facing correctness theorem for the frequency-table interface: Huffman preserves the input frequencies and returns an optimum tree for those frequencies.

theorem huffmanOfFreqs_correct (xs : List ( × )) (h_nodup : (xs.map Prod.fst).Nodup) (h_pos : p xs, p.2 > 0) (h_nonempty : xs []) : ( s, freqOf s (huffmanOfFreqs xs) = tableFreq xs s) optimum (huffmanOfFreqs xs) :=

Direct minimum-cost form of Huffman optimality for frequency tables. Any consistent tree with the same frequencies has cost at least the Huffman tree's cost.

)
end HuffmanV2end CLRS