10.3. Run-Length Encoding

  • File: RunLengthEncoding.ml

Run-length encoding is a compression methods that works well with bit-strings with large contiguous segments of repeating 0s and 1s by encoding the lengths of such segments in the interleaved fashion, starting from 0. For instance, the following string:

0000000000000001111111000000011111111111

can be encoded via 4-bit integer representation as follows:

1111011101111011

This encoding means that we have 15 (1111 in binary) 0s, then 7 (0111 in binary) 1s, then 7 0s and finally 11 (0111) ones.

10.3.1. Design Considerations

In order to turn this example into an effective data compression method, we need to answer the following questions:

  • How many bits do we use for representing the counts?

  • What if we encounter a sequence longer that a maximal value of a counter encoding permits?

  • What to do about short runs that under-use the length encoding?

We resolve those questions in the following way

  • Counts are between 0 and 255, so we use 8-bit representations.

  • We make all run lengths less than 256 by including runs of length 0 if needed

  • We encode short runs even though doing so might lengthen the output encoding.

10.3.2. Implementation

The resulting encoding procedure, which makes use of the previously implemented queue data type, is as follows:

open Queues
open DLLBasedQueue
open BinaryEncodings

open Extlib.IO


let read_next_bit input = try
    let bit = read_bits input 1 in
    Some bit
  with BatInnerIO.No_more_input -> None

It starts by reading the lengths of contiguous interleaving bit sequences from the file input to a queue:

let compute_lengths input =
  let m = 256 in
  let q = mk_queue 100 in
  let rec read_segments acc b =
    if acc >= m - 1 then begin
      enqueue q acc;
      read_segments 0 (1 - b)
    end
    else match read_next_bit input with
      | Some x ->
        if x = b
        then read_segments (acc + 1) b
        else begin
          enqueue q acc;
          read_segments 1 (1 - b)
        end
      | None -> enqueue q acc
  in
  read_segments 0 0;
  queue_to_list q

The obtained list is then used to write the corresponding bytes to the output channel:

let compress_binary_via_rle binary new_binary =
  let segments = read_from_binary compute_lengths binary in
  let rec loop out segs = match segs with
    | [] -> ()
    | h :: t -> (write_bits out ~nbits:8 h;
                 loop out t)
  in
  write_to_binary loop new_binary segments

The decoder is implemented similarly: the RLE decompression is simply a reversal of the initial compression. Reading 8 bits at a time from the compressed file gives us the list of segment lengths. To write this to the new binary, we just print the given number of alternating 1’s and 0’s in sequence:

let read_next_char input = try
    let len = read_bits input 8 in
    Some len
  with BatInnerIO.No_more_input -> None

let read_lengths input =
  let q = mk_queue 100 in
  let rec loop _ = match read_next_char input with
    | Some l -> (enqueue q l; loop ())
    | None -> ()
  in
  loop ();
  queue_to_list q

let rec write_bits_from_length out (l, b) =  match l with
  | [] -> ()
  | h :: t -> begin
      for i = 0 to h - 1 do
        write_bits out ~nbits:1 b
      done;
      write_bits_from_length out (t, 1 - b)
    end

let decompress_via_rle_into_binary archive new_binary =
  let lengths = read_from_binary read_lengths archive in
  write_to_binary write_bits_from_length new_binary (lengths, 0)

With the implementation of compression/decompression at place, we can test them on the particular DNA sequences:

let dna_rle_compression_test d =
  let dna = "dna.tmp" in
  let rle = "dna.rle" in
  let dna' = "dna.new" in
  write_dna_to_binary dna d;
  compress_binary_via_rle dna rle;
  decompress_via_rle_into_binary rle dna';
  let d' = read_dna_from_binary dna' in
  Sys.remove dna;
  Sys.remove rle;
  Sys.remove dna';
  d = d'

let%test _ = dna_rle_compression_test dna_string1
let%test _ = dna_rle_compression_test dna_string2
let%test _ = dna_rle_compression_test dna_string3
let%test _ = dna_rle_compression_test dna_string4