10.3. Run-Length Encoding
File:
RunLengthEncoding.ml
Run-length encoding is a compression methods that works well with bit-strings with large contiguous segments of repeating 0s and 1s by encoding the lengths of such segments in the interleaved fashion, starting from 0. For instance, the following string:
0000000000000001111111000000011111111111
can be encoded via 4-bit integer representation as follows:
1111011101111011
This encoding means that we have 15 (1111 in binary) 0s, then 7 (0111 in binary) 1s, then 7 0s and finally 11 (0111) ones.
10.3.1. Design Considerations
In order to turn this example into an effective data compression method, we need to answer the following questions:
How many bits do we use for representing the counts?
What if we encounter a sequence longer that a maximal value of a counter encoding permits?
What to do about short runs that under-use the length encoding?
We resolve those questions in the following way
Counts are between 0 and 255, so we use 8-bit representations.
We make all run lengths less than 256 by including runs of length 0 if needed
We encode short runs even though doing so might lengthen the output encoding.
10.3.2. Implementation
The resulting encoding procedure, which makes use of the previously implemented queue data type, is as follows:
open Queues
open DLLBasedQueue
open BinaryEncodings
open Extlib.IO
let read_next_bit input = try
let bit = read_bits input 1 in
Some bit
with BatInnerIO.No_more_input -> None
It starts by reading the lengths of contiguous interleaving bit sequences from the file input
to a queue:
let compute_lengths input =
let m = 256 in
let q = mk_queue 100 in
let rec read_segments acc b =
if acc >= m - 1 then begin
enqueue q acc;
read_segments 0 (1 - b)
end
else match read_next_bit input with
| Some x ->
if x = b
then read_segments (acc + 1) b
else begin
enqueue q acc;
read_segments 1 (1 - b)
end
| None -> enqueue q acc
in
read_segments 0 0;
queue_to_list q
The obtained list is then used to write the corresponding bytes to the output channel:
let compress_binary_via_rle binary new_binary =
let segments = read_from_binary compute_lengths binary in
let rec loop out segs = match segs with
| [] -> ()
| h :: t -> (write_bits out ~nbits:8 h;
loop out t)
in
write_to_binary loop new_binary segments
The decoder is implemented similarly: the RLE decompression is simply a reversal of the initial compression. Reading 8 bits at a time from the compressed file gives us the list of segment lengths. To write this to the new binary, we just print the given number of alternating 1’s and 0’s in sequence:
let read_next_char input = try
let len = read_bits input 8 in
Some len
with BatInnerIO.No_more_input -> None
let read_lengths input =
let q = mk_queue 100 in
let rec loop _ = match read_next_char input with
| Some l -> (enqueue q l; loop ())
| None -> ()
in
loop ();
queue_to_list q
let rec write_bits_from_length out (l, b) = match l with
| [] -> ()
| h :: t -> begin
for i = 0 to h - 1 do
write_bits out ~nbits:1 b
done;
write_bits_from_length out (t, 1 - b)
end
let decompress_via_rle_into_binary archive new_binary =
let lengths = read_from_binary read_lengths archive in
write_to_binary write_bits_from_length new_binary (lengths, 0)
With the implementation of compression/decompression at place, we can test them on the particular DNA sequences:
let dna_rle_compression_test d =
let dna = "dna.tmp" in
let rle = "dna.rle" in
let dna' = "dna.new" in
write_dna_to_binary dna d;
compress_binary_via_rle dna rle;
decompress_via_rle_into_binary rle dna';
let d' = read_dna_from_binary dna' in
Sys.remove dna;
Sys.remove rle;
Sys.remove dna';
d = d'
let%test _ = dna_rle_compression_test dna_string1
let%test _ = dna_rle_compression_test dna_string2
let%test _ = dna_rle_compression_test dna_string3
let%test _ = dna_rle_compression_test dna_string4