Difference between revisions of "Toy compression implementations"
From HaskellWiki
(fix the LZW code) 
(use laziness and HOFs to dramatically shrink lzw funcs) 

Line 14:  Line 14:  
module Compression where 
module Compression where 

−  import 
+  import List 
−  import 
+  import Maybe 
+  import IO (hFlush, stdout) 

chars = [' '..'~']  Becuase ' ' = 0x20 and '~' = 0x7F. 
chars = [' '..'~']  Becuase ' ' = 0x20 and '~' = 0x7F. 

−  
 Runlength encoding 
 Runlength encoding 

Line 27:  Line 26:  
decode_RLE :: [(Int,t)] > [t] 
decode_RLE :: [(Int,t)] > [t] 

decode_RLE = concatMap (uncurry replicate) 
decode_RLE = concatMap (uncurry replicate) 

+  
 LimpelZivWelch encoding 
 LimpelZivWelch encoding 

encode_LZW :: (Eq t) => [t] > [t] > [Int] 
encode_LZW :: (Eq t) => [t] > [t] > [Int] 

−  encode_LZW 
+  encode_LZW alphabet = work (map (:[]) alphabet) where 
−  +  chunk pred lst = last . takeWhile (pred . fst) . tail $ zip (inits lst) (tails lst) 

−  +  work table [] = [] 

−  work table 
+  work table lst = fromJust (elemIndex tok table) : work (table ++ [tok ++ [head rst]]) rst 
−  +  where (tok, rst) = chunk (`elem` table) lst 

−  let new = buffer ++ [x] 

−  in case elemIndex new table of 

−  Nothing > maybe undefined id (elemIndex buffer table) : work (table ++ [new]) [x] xs 

−  Just _ > work table new xs 

decode_LZW :: [t] > [Int] > [t] 
decode_LZW :: [t] > [Int] > [t] 

−  decode_LZW 
+  decode_LZW alphabet xs = concat output where 
−  +  output = map (table !!) xs 

−  +  table = map (:[]) alphabet ++ zipWith (++) output (map (take 1) (tail output)) 

−  +  
−  +  main = do x < take 20000 `fmap` readFile "/usr/share/dict/words" 

−  +  let l = length x `div` 80 

−  +  a = ['\0' .. '\255'] 

−  +  eq a b  a == b = putChar '=' >> hFlush stdout 

−  +   otherwise = error "data error" 

+  cmp = zipWith eq x . decode_LZW a . encode_LZW a $ x 

+  vl = map head $ unfoldr (\cm > case cm of [] > Nothing ; _ > Just (splitAt l cm)) cmp 

+  sequence_ vl 

</haskell> 
</haskell> 
Latest revision as of 01:59, 9 March 2007
About
This code is provided in the hope that someone might find it interesting/entertaining, and to demonstrate what an excellent programming language Haskell truly is. (A working polymorphic LZW implementation in 10 lines? Try that in Java!)
This is 'toy' code. Please don't try to use it to compress multiGB of data. It has not been thoroughly checked for correctness, and I shudder to think what the time and space complexity would be like! However, it is enlightening and entertaining to see how many algorithms you can implement with a handful of lines...
MathematicalOrchid 16:46, 15 February 2007 (UTC)
Main module
module Compression where
import List
import Maybe
import IO (hFlush, stdout)
chars = [' '..'~']  Becuase ' ' = 0x20 and '~' = 0x7F.
 Runlength encoding
encode_RLE :: (Eq t) => [t] > [(Int,t)]
encode_RLE = map (\xs > (length xs, head xs)) . groupBy (==)
decode_RLE :: [(Int,t)] > [t]
decode_RLE = concatMap (uncurry replicate)
 LimpelZivWelch encoding
encode_LZW :: (Eq t) => [t] > [t] > [Int]
encode_LZW alphabet = work (map (:[]) alphabet) where
chunk pred lst = last . takeWhile (pred . fst) . tail $ zip (inits lst) (tails lst)
work table [] = []
work table lst = fromJust (elemIndex tok table) : work (table ++ [tok ++ [head rst]]) rst
where (tok, rst) = chunk (`elem` table) lst
decode_LZW :: [t] > [Int] > [t]
decode_LZW alphabet xs = concat output where
output = map (table !!) xs
table = map (:[]) alphabet ++ zipWith (++) output (map (take 1) (tail output))
main = do x < take 20000 `fmap` readFile "/usr/share/dict/words"
let l = length x `div` 80
a = ['\0' .. '\255']
eq a b  a == b = putChar '=' >> hFlush stdout
 otherwise = error "data error"
cmp = zipWith eq x . decode_LZW a . encode_LZW a $ x
vl = map head $ unfoldr (\cm > case cm of [] > Nothing ; _ > Just (splitAt l cm)) cmp
sequence_ vl
Some examples are in order:
> encode_RLE "AAAABBBBDDCCCCEEEGGFFFF"
[(4,'A'),(4,'B'),(2,'D'),(4,'C'),(3,'E'),(2,'G'),(4,'F')]
> decode_RLE [(4,'A'),(4,'B'),(2,'D'),(4,'C'),(3,'E'),(2,'G'),(4,'F')]
"AAAABBBBDDCCCCEEEGGFFFF"
> encode_LZW chars "This is just a simple test."
[52,72,73,83,0,97,0,74,85,83,84,0,65,0,83,73,77,80,76,69,0,84,69,104,14]
> decode_LZW chars [52,72,73,83,0,97,0,74,85,83,84,0,65,0,83,73,77,80,76,69,0,84,69,104,14]
"This is just a simple test."
Huffman coding
module Huffman
(count, markov1, Tree, encode_huffman, decode_huffman)
where
import Data.List (nub)
 Marvok1 probability model...
count :: (Eq t) => [t] > [(t,Int)]
count xs = map (\x > (x, length $ filter (x ==) xs)) $ nub xs
markov1 :: (Eq t) => [t] > [(t,Double)]
markov1 xs =
let n = fromIntegral $ length xs
in map (\(x,c) > (x, fromIntegral c / n)) $ count xs
 Build a Huffman tree...
data Tree t = Leaf Double t  Branch Double (Tree t) (Tree t) deriving Show
prob :: Tree t > Double
prob (Leaf p _) = p
prob (Branch p _ _) = p
get_tree :: [Tree t] > (Tree t, [Tree t])
get_tree (t:ts) = work t [] ts where
work x xs [] = (x,xs)
work x xs (y:ys)
 prob y < prob x = work y (x:xs) ys
 otherwise = work x (y:xs) ys
huffman_build :: [(t,Double)] > Tree t
huffman_build = build . map (\(t,p) > Leaf p t) where
build [t] = t
build ts =
let (t0,ts0) = get_tree ts
(t1,ts1) = get_tree ts0
in build $ Branch (prob t0 + prob t1) t0 t1 : ts1
 Make codebook...
data Bit = Zero  One deriving (Eq, Show)
type Bits = [Bit]
huffman_codebook :: Tree t > [(t,Bits)]
huffman_codebook = work [] where
work bs (Leaf _ x) = [(x,bs)]
work bs (Branch _ t0 t1) = work (bs ++ [Zero]) t0 ++ work (bs ++ [One]) t1
 Do the coding!
encode :: (Eq t) => [(t,Bits)] > [t] > Bits
encode cb = concatMap (\x > maybe undefined id $ lookup x cb)
decode :: (Eq t) => Tree t > Bits > [t]
decode t = work t t where
work _ (Leaf _ x) [] = [x]
work t (Leaf _ x) bs = x : work t t bs
work t (Branch _ t0 t1) (b:bs)
 b == Zero = work t t0 bs
 otherwise = work t t1 bs
encode_huffman :: (Eq t) => [t] > (Tree t, Bits)
encode_huffman xs =
let t = huffman_build $ markov1 xs
bs = encode (huffman_codebook t) xs
in (t,bs)
decode_huffman :: (Eq t) => Tree t > Bits > [t]
decode_huffman = decode
If anybody can make this code shorter / more elegant, feel free!
A short demo:
> encode_huffman "this is just a simple test"
<loads of data>
> decode_huffman (fst it) (snd it)
"this is just a simple test"