Toy compression implementations: Difference between revisions

From HaskellWiki
(Lots of lushous example code.)
 
(use laziness and HOFs to dramatically shrink lzw funcs)
 
(4 intermediate revisions by 2 users not shown)
Line 1: Line 1:
[[Category:Code]]
[[Category:Code]]
== About ==


This code is provided in the hope that someone might find it interesting/entertaining, and to demonstrate what an excellent programming language Haskell truly is. (A working polymorphic LZW implementation in 10 lines? Try ''that'' in Java!)
This code is provided in the hope that someone might find it interesting/entertaining, and to demonstrate what an excellent programming language Haskell truly is. (A working polymorphic LZW implementation in 10 lines? Try ''that'' in Java!)


This is 'toy' code. Please don't try to use it to compress multi-GB of data. It has not been thoroughly checked for correctly, and I shudder to think what the time and space complexity would be like! However, it is enlightening and entertaining to see how many algorithms you can implement with a handful of lines...
This is 'toy' code. Please don't try to use it to compress multi-GB of data. It has not been thoroughly checked for correctness, and I shudder to think what the time and space complexity would be like! However, it is enlightening and entertaining to see how many algorithms you can implement with a handful of lines...


[[User:MathematicalOrchid|MathematicalOrchid]] 16:46, 15 February 2007 (UTC)
[[User:MathematicalOrchid|MathematicalOrchid]] 16:46, 15 February 2007 (UTC)
== Main module ==


<haskell>
<haskell>
module Compression where
module Compression where


import Data.List
import List
import Maybe
import IO (hFlush, stdout)


chars = [' '..'~']  -- Becuase ' ' = 0x20 and '~' = 0x7F.


-- Run-length encoding
-- Run-length encoding


encode_RLE :: (Eq x) => [x] -> [(Int,x)]
encode_RLE :: (Eq t) => [t] -> [(Int,t)]
encode_RLE = map (\xs -> (length xs, head xs)) . groupBy (==)
encode_RLE = map (\xs -> (length xs, head xs)) . groupBy (==)


decode_RLE :: [(Int,x)] -> [x]
decode_RLE :: [(Int,t)] -> [t]
decode_RLE = concatMap (uncurry replicate)
decode_RLE = concatMap (uncurry replicate)




-- Limpel-Ziv-Welsh compression (Recommend using [Word8] or [SmallAlpha] for input!)
-- Limpel-Ziv-Welch encoding


encode_LZW :: (Eq x, Enum x, Bounded x) => [x] -> [Int]
encode_LZW :: (Eq t) => [t] -> [t] -> [Int]
encode_LZW [] = []
encode_LZW alphabet = work (map (:[]) alphabet) where
encode_LZW (x:xs) = work init [x] xs where
   chunk pred lst = last . takeWhile (pred . fst) . tail $ zip (inits lst) (tails lst)
   init = map (\x -> [x]) $ enumFromTo minBound maxBound
   work table [] = []
   work table buffer []     = [maybe undefined id (elemIndex buffer table)]
   work table lst = fromJust (elemIndex tok table) : work (table ++ [tok ++ [head rst]]) rst
   work table buffer (x:xs) =
    where (tok, rst) = chunk (`elem` table) lst
    let new = buffer ++ [x]
    in  case elemIndex new table of
          Nothing -> maybe undefined id (elemIndex buffer table) : work (table ++ [new]) [x] xs
          Just _  -> work table new xs


-- TODO: Matching decode_LZW function.
decode_LZW :: [t] -> [Int] -> [t]
decode_LZW alphabet xs = concat output where
  output = map (table !!) xs
  table = map (:[]) alphabet ++ zipWith (++) output (map (take 1) (tail output))


-- TODO: Huffman encoding.
main = do x <- take 20000 `fmap` readFile "/usr/share/dict/words"
          let l = length x `div` 80
              a = ['\0' .. '\255']
      eq a b | a == b    = putChar '=' >> hFlush stdout
            | otherwise = error "data error"
      cmp = zipWith eq x . decode_LZW a . encode_LZW a $ x
              vl = map head $ unfoldr (\cm -> case cm of [] -> Nothing ; _ -> Just (splitAt l cm)) cmp
          sequence_ vl


-- TODO: Arithmetic coding.
</haskell>
</haskell>


It may also be useful to add the following for test purposes:
Some examples are in order:


<haskell>
<haskell>
import Data.Word
> encode_RLE "AAAABBBBDDCCCCEEEGGFFFF"
 
[(4,'A'),(4,'B'),(2,'D'),(4,'C'),(3,'E'),(2,'G'),(4,'F')]
 
 
> decode_RLE [(4,'A'),(4,'B'),(2,'D'),(4,'C'),(3,'E'),(2,'G'),(4,'F')]
 
"AAAABBBBDDCCCCEEEGGFFFF"


data SmallAlpha = AA | BB | CC | DD deriving (Show, Eq, Ord, Enum, Bounded)


parse1 'a' = AA
> encode_LZW chars "This is just a simple test."
parse1 'b' = BB
parse1 'c' = CC
parse1  _  = DD -- For safety


parse = map parse1
[52,72,73,83,0,97,0,74,85,83,84,0,65,0,83,73,77,80,76,69,0,84,69,104,14]
 
 
> decode_LZW chars [52,72,73,83,0,97,0,74,85,83,84,0,65,0,83,73,77,80,76,69,0,84,69,104,14]
 
"This is just a simple test."
</haskell>
</haskell>


Anybody know how to use <hask>newtype</hask> to make a type like <hask>Char</hask> but with <hask>minBound</hask> and <hask>maxBound</hask> much closer together?
== Huffman coding ==
 
<haskell>
module Huffman
    (count, markov1, Tree, encode_huffman, decode_huffman)
  where
 
import Data.List (nub)
 
-- Marvok1 probability model...
 
count :: (Eq t) => [t] -> [(t,Int)]
count xs = map (\x -> (x, length $ filter (x ==) xs)) $ nub xs
 
markov1 :: (Eq t) => [t] -> [(t,Double)]
markov1 xs =
  let n = fromIntegral $ length xs
  in  map (\(x,c) -> (x, fromIntegral c / n)) $ count xs
 
 
-- Build a Huffman tree...
 
data Tree t = Leaf Double t | Branch Double (Tree t) (Tree t) deriving Show
 
prob :: Tree t -> Double
prob (Leaf  p _)  = p
prob (Branch p _ _) = p
 
get_tree :: [Tree t] -> (Tree t, [Tree t])
get_tree (t:ts) = work t [] ts where
  work x xs [] = (x,xs)
  work x xs (y:ys)
    | prob y < prob x = work y (x:xs) ys
    | otherwise      = work x (y:xs) ys
 
huffman_build :: [(t,Double)] -> Tree t
huffman_build = build . map (\(t,p) -> Leaf p t) where
  build [t] = t
  build ts =
    let (t0,ts0) = get_tree ts
        (t1,ts1) = get_tree ts0
    in  build $ Branch (prob t0 + prob t1) t0 t1 : ts1
 
 
-- Make codebook...
 
data Bit  = Zero | One deriving (Eq, Show)
type Bits = [Bit]
 
huffman_codebook :: Tree t -> [(t,Bits)]
huffman_codebook = work [] where
  work bs (Leaf _ x) = [(x,bs)]
  work bs (Branch _ t0 t1) = work (bs ++ [Zero]) t0 ++ work (bs ++ [One]) t1
 
 
-- Do the coding!
 
encode :: (Eq t) => [(t,Bits)] -> [t] -> Bits
encode cb = concatMap (\x -> maybe undefined id $ lookup x cb)
 
decode :: (Eq t) => Tree t -> Bits -> [t]
decode t = work t t where
  work _ (Leaf  _ x)        []  = [x]
  work t (Leaf  _ x)        bs  = x : work t t bs
  work t (Branch _ t0 t1) (b:bs)
    | b == Zero = work t t0 bs
    | otherwise = work t t1 bs
 
encode_huffman :: (Eq t) => [t] -> (Tree t, Bits)
encode_huffman xs =
  let t  = huffman_build $ markov1 xs
      bs = encode (huffman_codebook t) xs
  in (t,bs)
 
decode_huffman :: (Eq t) => Tree t -> Bits -> [t]
decode_huffman = decode
</haskell>
 
If anybody can make this code shorter / more elegant, feel free!
 
A short demo:
<haskell>
> encode_huffman "this is just a simple test"
<loads of data>
 
> decode_huffman (fst it) (snd it)
"this is just a simple test"
</haskell>

Latest revision as of 01:59, 9 March 2007


About

This code is provided in the hope that someone might find it interesting/entertaining, and to demonstrate what an excellent programming language Haskell truly is. (A working polymorphic LZW implementation in 10 lines? Try that in Java!)

This is 'toy' code. Please don't try to use it to compress multi-GB of data. It has not been thoroughly checked for correctness, and I shudder to think what the time and space complexity would be like! However, it is enlightening and entertaining to see how many algorithms you can implement with a handful of lines...

MathematicalOrchid 16:46, 15 February 2007 (UTC)

Main module

module Compression where

import List
import Maybe
import IO (hFlush, stdout)

chars = [' '..'~']   -- Becuase ' ' = 0x20 and '~' = 0x7F.

-- Run-length encoding

encode_RLE :: (Eq t) => [t] -> [(Int,t)]
encode_RLE = map (\xs -> (length xs, head xs)) . groupBy (==)

decode_RLE :: [(Int,t)] -> [t]
decode_RLE = concatMap (uncurry replicate)


-- Limpel-Ziv-Welch encoding

encode_LZW :: (Eq t) => [t] -> [t] -> [Int]
encode_LZW alphabet = work (map (:[]) alphabet) where
  chunk pred lst = last . takeWhile (pred . fst) . tail $ zip (inits lst) (tails lst)
  work table []  = []
  work table lst = fromJust (elemIndex tok table) : work (table ++ [tok ++ [head rst]]) rst
    where (tok, rst) = chunk (`elem` table) lst

decode_LZW :: [t] -> [Int] -> [t]
decode_LZW alphabet xs = concat output where
  output = map (table !!) xs
  table = map (:[]) alphabet ++ zipWith (++) output (map (take 1) (tail output))

main = do x <- take 20000 `fmap` readFile "/usr/share/dict/words"
          let l = length x `div` 80
              a = ['\0' .. '\255']
	      eq a b | a == b    = putChar '=' >> hFlush stdout
	             | otherwise = error "data error"
	      cmp = zipWith eq x . decode_LZW a . encode_LZW a $ x
              vl = map head $ unfoldr (\cm -> case cm of [] -> Nothing ; _ -> Just (splitAt l cm)) cmp
          sequence_ vl

Some examples are in order:

> encode_RLE "AAAABBBBDDCCCCEEEGGFFFF"

[(4,'A'),(4,'B'),(2,'D'),(4,'C'),(3,'E'),(2,'G'),(4,'F')]


> decode_RLE [(4,'A'),(4,'B'),(2,'D'),(4,'C'),(3,'E'),(2,'G'),(4,'F')]

"AAAABBBBDDCCCCEEEGGFFFF"


> encode_LZW chars "This is just a simple test."

[52,72,73,83,0,97,0,74,85,83,84,0,65,0,83,73,77,80,76,69,0,84,69,104,14]


> decode_LZW chars [52,72,73,83,0,97,0,74,85,83,84,0,65,0,83,73,77,80,76,69,0,84,69,104,14]

"This is just a simple test."

Huffman coding

module Huffman
    (count, markov1, Tree, encode_huffman, decode_huffman)
  where

import Data.List (nub)

-- Marvok1 probability model...

count :: (Eq t) => [t] -> [(t,Int)]
count xs = map (\x -> (x, length $ filter (x ==) xs)) $ nub xs

markov1 :: (Eq t) => [t] -> [(t,Double)]
markov1 xs =
  let n = fromIntegral $ length xs
  in  map (\(x,c) -> (x, fromIntegral c / n)) $ count xs


-- Build a Huffman tree...

data Tree t = Leaf Double t | Branch Double (Tree t) (Tree t) deriving Show

prob :: Tree t -> Double
prob (Leaf   p _)   = p
prob (Branch p _ _) = p

get_tree :: [Tree t] -> (Tree t, [Tree t])
get_tree (t:ts) = work t [] ts where
  work x xs [] = (x,xs)
  work x xs (y:ys)
    | prob y < prob x = work y (x:xs) ys
    | otherwise       = work x (y:xs) ys

huffman_build :: [(t,Double)] -> Tree t
huffman_build = build . map (\(t,p) -> Leaf p t) where
  build [t] = t
  build ts =
    let (t0,ts0) = get_tree ts
        (t1,ts1) = get_tree ts0
    in  build $ Branch (prob t0 + prob t1) t0 t1 : ts1


-- Make codebook...

data Bit  = Zero | One deriving (Eq, Show)
type Bits = [Bit]

huffman_codebook :: Tree t -> [(t,Bits)]
huffman_codebook = work [] where
  work bs (Leaf _ x) = [(x,bs)]
  work bs (Branch _ t0 t1) = work (bs ++ [Zero]) t0 ++ work (bs ++ [One]) t1


-- Do the coding!

encode :: (Eq t) => [(t,Bits)] -> [t] -> Bits
encode cb = concatMap (\x -> maybe undefined id $ lookup x cb)

decode :: (Eq t) => Tree t -> Bits -> [t]
decode t = work t t where
  work _ (Leaf   _ x)        []  = [x]
  work t (Leaf   _ x)        bs  = x : work t t bs
  work t (Branch _ t0 t1) (b:bs)
    | b == Zero = work t t0 bs
    | otherwise = work t t1 bs

encode_huffman :: (Eq t) => [t] -> (Tree t, Bits)
encode_huffman xs =
  let t  = huffman_build $ markov1 xs
      bs = encode (huffman_codebook t) xs
  in (t,bs)

decode_huffman :: (Eq t) => Tree t -> Bits -> [t]
decode_huffman = decode

If anybody can make this code shorter / more elegant, feel free!

A short demo:

> encode_huffman "this is just a simple test"
<loads of data>

> decode_huffman (fst it) (snd it)
"this is just a simple test"