Implementating Huffman algorithm is a good practice to progress in algorithms and is a good opportunity to train on working methods such as TDD.

This article is based on the gist I wrote to train me.

Huffman compression method can be decomposed in 3 steps:

  • computing a dictionary with frequency of occurrences
  • building a tree representing frequencies and associated path
  • storage of tree and path of each occurrence bit per bit

In next examples, we will work on the string “aaabbffppppeee kk aa”

Computing frequencies

At first, we can imagine a dictionary of char * int containing each char frequency. It don’t like to work with chars when I implement binary data traitment algorithms. So, I created an empty function getting (byte * int) list

let getFrequencies (l:byte list) : (byte*int) list =
  failwith "not implemented"

Then I wrote a test I executed each time I could compile:

[<Test>]
member ``when is``.``computing frequencies`` () =
  let f =
    "aaabbffppppeee kk aa"
    |> strToBytes
    |> getFrequencies
    // converting bytes to chars to make tests writing easier
    |> List.map (fun (b,c) -> char b , c)
    |> dict
  Assert.AreEqual(f.Item 'a', 5)
  Assert.AreEqual(f.Item 'b', 2)
  Assert.AreEqual(f.Item 'f', 2)
  Assert.AreEqual(f.Item 'p', 4)
  Assert.AreEqual(f.Item 'e', 3)
  Assert.AreEqual(f.Item 'k', 2)

A simple function to work with bytes list could be:

let getFrequencies (l:byte list) =
  l |> Seq.groupBy(fun c -> c)
  |> Seq.map (fun (c,l) -> c, (List.ofSeq l).Length)
  |> Seq.toList

I did want to try compression on real files, so I couldn’t load entire content in a byte list. Next function is computing frequencies from a seekable stream.

let getFrequencies' (stream:Stream) =
  let rec loop (acc:(byte*int) list) =
    let i = stream.ReadByte()
    if i < 0
    then acc
    else
      let b = byte i
      match acc |> List.tryFind(fun (v,_) -> v = b) with
      | Some (_,c) ->
          acc
          |> List.filter(fun (v,_) -> v <> b)
          |> List.append [(b, (c+1))]
      | None -> acc |> List.append [(b, 1)]
      |> loop
  stream.Position <- 0L
  let f = loop []
  stream.Position <- 0L
  f

Building the tree

My tree model is simply:

type bit = bool
type path = bit list
type BinaryTreeNode =
  | Leaf of byte * frequency:int
  | Branch of left:BinaryTreeNode option
              * right:BinaryTreeNode option
              * frequency:int
  member __.Switch(b:bit) =
    match __ with
    | Branch (_, right, _) when b -> right
    | Branch (left, _, _) when not b-> left
    | _ -> None
  static member Empty = Leaf(0uy,0)
  member private __.cost =
    lazy
      match __ with
      | Leaf (_, f) -> f
      | Branch(_,_,f) -> f
  member __.Cost() = __.cost.Value

type BinaryTree (root:BinaryTreeNode) =
  member __.Root with get() = root
  member __.GetPath (data:byte) =
    let rec scan (node:BinaryTreeNode) (d:byte) (p:path) =
      let scanBranch (left:BinaryTreeNode option) (right:BinaryTreeNode option) (cp:path) =
        let scanChildren parent v =
          match parent with
          | Some(Leaf(l,f)) -> scan (Leaf(l,f)) d (cp @ [v])
          | Some n -> scan n d (cp @ [v])
          | None -> None
        scanChildren left false
        |> function
           | Some r -> Some r
           | None -> scanChildren right true
      match node with
      | Leaf (b,_) when b = data -> Some p
      | Leaf _ -> None
      | Branch (left, right, _) -> scanBranch left right p
    scan root data []
  member __.GetByte(p:path) =
    let rec loop (bits:path) (current:BinaryTreeNode) =
      match bits, current with
      | true :: tail, Branch(_, Some r, _) -> loop tail r
      | false :: tail, Branch(Some l,_, _) -> loop tail l
      | _, Leaf(d, _) -> d
      | _ -> failwith "invalid path"
    loop p root

The cost is is representing total occurencies count contained by child branches or leaf. The swith method helps to browse sub branches of a tree node.

When the model is written, we can write tests:

[<Test>]
member __.``checking path consistence`` () =
  let tree1 =
    "aaabbffppppeee kk aa"
    |> strToBytes
    |> getFrequencies
    |> buildTree
  let pa = tree1.GetPath(byte 'a')
  let pb = tree1.GetPath(byte 'b')
  let pf = tree1.GetPath(byte 'f')
  let pp = tree1.GetPath(byte 'p')
  let pe = tree1.GetPath(byte 'e')
  let pk = tree1.GetPath(byte 'k')
  Assert.AreEqual(byte 'a', tree1.GetByte(pa.Value))
  Assert.AreEqual(byte 'b', tree1.GetByte(pb.Value))
  Assert.AreEqual(byte 'f', tree1.GetByte(pf.Value))
  Assert.AreEqual(byte 'p', tree1.GetByte(pp.Value))
  Assert.AreEqual(byte 'e', tree1.GetByte(pe.Value))
  Assert.AreEqual(byte 'k', tree1.GetByte(pk.Value))

After tests writting, we can implement a function populating the tree from frequencies:

let buildTree (frequencies: (byte*int) list) =
  let sort (tree:BinaryTreeNode list) =
    tree |> List.sortBy (fun i -> i.Cost())
  let rec loop (tree:BinaryTreeNode list) =
    match sort tree with
    | left::right::[] ->
      Branch(Some left, Some right, left.Cost() + right.Cost())
    | left::right::tail ->
      let branch = Branch(Some left, Some right, left.Cost() + right.Cost())
      loop (branch :: tail)
    | [single] -> single
    | [] -> failwith "invalid operation"
  frequencies
  |> Seq.map Leaf
  |> List.ofSeq
  |> loop
  |> BinaryTree

Calculated paths are:

occurrency path option binary frequency
a Some({[True;False]}) 10 5
p Some({[False; True]}) 01 4
e Some({[True; True; False]}) 110 3
b Some({[False; False; False]}) 000 2
f Some({[False; False; True]}) 001 2
k Some({[True; True; True; True]}) 1111 2
’ ‘ Some({[True; True; True; False]}) 1110 2

This table is demonstrating that implementation is correct. Occurrences whose frequencies are higher have the shortest paths.

A diagram summarizing this table could be like the following: Tree

Test online to compute a Huffman tree

Try huffman implementation compiled with fable.

occurrency binary frequency
a 10 5

Compression summary

 

Storage

The occurrency ‘a’ will be coded in 2 bits. We can not write less than 8 bits in a stream. (with the WriteByte method) So I wrote a tiny BitWritter:

type BitWriter(stream:Stream) =
  let buffer = ref 0uy
  let len = ref 0
  let flush() =
    while !len < 8 do
      buffer := !buffer <<< 1
      buffer := !buffer ||| 0uy
      len := !len + 1
    stream.WriteByte(!buffer)
    stream.Flush()
    buffer := 0uy
    len := 0
  let mustFlush() =
    !len >= 8
  member __.Flush() =
    if mustFlush() then flush()
  member __.Close() =
    if !len > 0 then flush()
  member __.Write(b:bit) =
    let v = if b then 1uy else 0uy
    buffer := ((!buffer) <<< 1) ||| v
    len := !len + 1
    __.Flush()
  member __.Write(bits:bit list) =
    for b in bits do __.Write b
  interface IDisposable with
    member __.Dispose() =
      __.Close()

The buffer is a simple byte. The write method increases the len and shift bits of buffer. When len is equal to 8 bits, we write the byte in the stream.

To read bit per bit in a stream, I use:

type BitReader(stream:Stream) =
  let buffer = ref 0uy
  let len = ref 0
  let position = ref 0L
  let loadBuffer() =
    if stream.Position >= stream.Length
    then buffer := 0uy
    len := 8
    let by = stream.ReadByte()
    if by = -1
    then buffer := 0uy
    buffer := byte by
  let readBit() =
    len := !len - 1
    let mask = 1 <<< !len
    let v = !buffer &&& (byte mask)
    position := !position + 1L
    v >= 1uy
  let peekBit() =
    let mask = 1 <<< (!len - 1)
    let v = !buffer &&& (byte mask)
    v >= 1uy
  member __.End with get() = stream.Position >= stream.Length && !len <= 0
  member __.Position with get() = position
  member __.Read() =
    if __.End
    then None
    else
      if !len <= 0
      then loadBuffer()
      Some (readBit())
  member __.Peek() =
    if __.End
    then None
    else
      if !len <= 0
      then loadBuffer()
      Some (peekBit())

Try this implementation on real files using gist