google-research · cyntsh · Aug 6, 2021 · Aug 6, 2021 · Aug 10, 2021 · Aug 11, 2021
diff --git a/examples/arithmetic-coding.dx b/examples/arithmetic-coding.dx
@@ -0,0 +1,102 @@
+'## [Arithmetic coding](https://en.wikipedia.org/wiki/Arithmetic_coding)
+This example demonstrates a lossless compression method for a string of lowercase letters.
+Rather than assigning a code to each letter, the entire string is encoded
+into a single floating-point number.
+
+-- Index set with one entry per supported letter (26 letters, indexed relative to 'a').
+Alphabet = Fin 26
+-- A pair of floats; the coding functions below treat it as (start, width).
+Interval = (Float&Float)
+-- Initial interval: start 0 and width 1.
+top:Interval = (0.,1.)
+
+-- Position of a character counted from 'a' (so 'a' maps to 0).
+def charToIdx (c: Word8) : Int =
+  origin = W8ToI 'a'
+  W8ToI c - origin
+
+-- Character located `i` positions after 'a'; undoes charToIdx.
+def idxToChar (i: Int) : Word8 =
+  origin = W8ToI 'a'
+  IToW8 (origin + i)
+
+'### Statistical modelling
+First, estimate the probability of each letter from its frequency in the string to be encoded.
+
+-- Running total of the positive entries of `ps`, taken *before* each entry:
+-- a positive entry receives the sum of all positive entries that precede it,
+-- and every non-positive entry receives 0.
+def cumProb (ps: n=>Float) : n=>Float =
+  withState 0.0 \acc.
+    for i.
+      p = ps.i
+      if p > 0.
+        then
+          before = get acc
+          acc := before + p
+          before
+        else 0.
+
+-- Counts how many times each letter occurs in `str`.
+-- Each character is turned into an Alphabet index through charToIdx.
+-- NOTE(review): assumes every character lies in 'a'..'z'; no range check is
+-- performed here -- confirm callers guarantee this.
+def getFrequency (str: (Fin l)=>Word8) : Alphabet=>Int =
+  counts: Alphabet => Int = zero
+  yieldState counts \ref.
+    for pos.
+      letter = (charToIdx str.pos)@Alphabet
+      slot = ref!letter
+      slot := get slot + 1
+
+-- Pairs every letter with (relative frequency, cumulative value from cumProb).
+-- `l` is the divisor applied to each count in `freq` (the demo passes the
+-- length of the string).
+def getProbability (l: Int) (freq: Alphabet=>Int) : Alphabet=>(Float&Float) =
+  total = IToF l
+  relFreq = for k. IToF freq.k / total
+  starts = cumProb relFreq
+  for k. (relFreq.k, starts.k)
+
+'### Scaling functions
+
+-- Builds, for every letter, the function that narrows an interval when that
+-- letter is coded. Writing the entry `p.i` as (prob, start):
+--   * an entry equal to (0., 0.) yields the identity function;
+--   * otherwise an interval (x, w) is mapped to (x + w*start, w*prob).
+def getUpdateRule (p: Alphabet=>(Float&Float)) : Alphabet=>(Interval->Interval) =
+  for letter.
+    (prob, start) = p.letter
+    if p.letter == (0.,0.)
+      then id
+      else
+        \(lo, width). (lo + width*start, width*prob)
+
+-- One encoding step: applies to `in` the update rule that belongs to the
+-- i-th character of `str`.
+def subdivide (str: (Fin l)=>Word8)
+              (rule: Alphabet=>(Interval->Interval))
+              (i: (Fin l)) (in: Interval) : Interval =
+  letter = (charToIdx str.i)@Alphabet
+  rule.letter in
+
+-- One decoding step: tries the letters in index order and picks the first
+-- one whose sub-interval of `in` contains `code` (start <= code < start+width).
+-- The chosen letter is appended to `str` and its sub-interval is returned as
+-- the new current interval. Letters whose rule leaves `in` unchanged are skipped.
+-- Presumably boundedIter yields its default (' ', top) when no letter
+-- matches -- confirm against the prelude.
+-- NOTE(review): a rule built from probability 1 and cumulative start 0 also
+-- leaves the interval unchanged, so such a letter would be skipped as well --
+-- confirm whether strings made of a single repeated letter must decode.
+def findInterval  (l: Int)
+                  (code: Float)
+                  (rule: Alphabet=>(Interval->Interval))
+                  (i: (Fin l))
+                  ((str,in): (List Word8 & Interval)) : (List Word8 & Interval) =
+  (letter, in') = boundedIter (size Alphabet) (' ', top) \j.
+    candidate = rule.(j@_) in
+    if candidate == in
+      then Continue
+      else
+        (x, w) = candidate
+        if code >= x && code < (x+w)
+          then Done (idxToChar j, candidate)
+          else Continue
+  (str <> AsList 1 [letter], in')
+
+'### Coding interface
+Start from an initial interval, [0, 1). 
+For each letter of the string, the current interval is partitioned according to the
+cumulative probabilities of all letters, and then narrowed to the sub-interval
+assigned to that letter.
+The decoding process retraces the steps of the encoding process to recover the correct letters.
+
+-- Encodes `str` as one float: starting from `top`, the interval is narrowed
+-- once per character with `subdivide`, and the midpoint (start + width/2)
+-- of the final interval is returned.
+def encode (str: (Fin l)=>Word8) (rule: Alphabet=>(Interval->Interval)) : Float =
+  (lo, width) = fold top (subdivide str rule)
+  lo + width/2.
+
+-- Recovers `l` letters from `code`: starting from an empty list and the
+-- interval `top`, `findInterval` is folded over `Fin l`, appending one letter
+-- per step. Only the accumulated letters are returned.
+def decode (l: Int) (code: Float) (rule: Alphabet=>(Interval->Interval)) : List Word8 =
+  noLetters: List Word8 = AsList _ []
+  result = fold (noLetters, top) (findInterval l code rule)
+  fst result
+
+'### Demo: Lossless compression on a test string
+
+-- Test string; it only uses the letters a-d.
+str' = "abbadcabccdd"
+-- Unpack the list into its length `l` and its table of characters `str`.
+(AsList l str) = str'
+
+-- Letter statistics of the test string and the per-letter interval updates derived from them.
+p = getProbability l $ getFrequency str
+r = getUpdateRule p
+
+-- Encode the whole string as a single float.
+code = encode str r
+code
+> 0.081569
+
+-- Decoding with the same length and rules reproduces the original string.
+decoded = decode l code r
+decoded == str'
+> True
+