13 votes

Programming Challenge: Text compression

Posted May 24, 2019 by gpl (edited May 24, 2019)

Tags: challenge.programming, compression, coding

In an effort to make these weekly, I present a new programming challenge.

The challenge this week is to compress some text using a prefix code. Prefix codes associate each letter with a given bit string, such that no encoded bitstring is the prefix of any other. These bit strings are then concatenated into one long integer which is separated into bytes for ease of reading. These bytes can be represented as hex values as well. The provided prefix encoding is as follows:

char	value	char	value
' '	11	'e'	101
't'	1001	'o'	10001
'n'	10000	'a'	011
's'	0101	'i'	01001
'r'	01000	'h'	0011
'd'	00101	'l'	001001
'~'	001000	'u'	00011
'c'	000101	'f'	000100
'm'	000011	'p'	0000101
'g'	0000100	'w'	0000011
'b'	0000010	'y'	0000001
'v'	00000001	'j'	000000001
'k'	0000000001	'x'	00000000001
'q'	000000000001	'z'	000000000000

Challenge

Your program should accept a lowercase string (including the ~ character), and should output the formatted compressed bit string in binary and hex. Your final byte should be 0 padded so that it has 8 bits as required. For your convenience, here is the above table in a text file for easy read-in.

Example

Here is an example:

$> tildes ~comp
10010100 10010010 01011010 10111001 00000010 11000100 00110000 10100000
94 92 5A B9 02 C4 30 A0

Bonuses

Print the data compression ratio for a given compression, assuming the original input was encoded in 8 bit ASCII (one byte per character).
~~2. Output the ASCII string corresponding to the encoded byte string in addition to the above outputs.~~
@onyxleopard points out that many bytes won't actually be valid ASCII. Instead, do as they suggested and treat each byte as an ordinal value and print it as if encoded as UTF-8.
An input prefixed by 'D' should be interpreted as an already compressed string using this encoding, and should be decompressed (by inverting the above procedure).

Previous Challenges (I am aware of prior existing ones, but it is hard to collect them as they were irregular. Thus I list last week's challenge as 'Week 1')
Week 1

6 comments

[3]

onyxleopard

May 24, 2019 (edited May 25, 2019)

Link

The encoded bytes are not guaranteed to be valid ASCII… E.g., in your example, there are only 3 bytes of 8 that are in the ordinal range [0..128): In [1]: hex_words = '94 92 5A B9 02 C4 30...

Output the ASCII string corresponding to the encoded byte string in addition to the above outputs.

The encoded bytes are not guaranteed to be valid ASCII… E.g., in your example, there are only 3 bytes of 8 that are in the ordinal range [0..128):

In [1]: hex_words = '94 92 5A B9 02 C4 30 A0'.split()                                                                                        

In [2]: dict([(x, int(x, 16)) for x in hex_words if (int(x, 16) in range(128))])                                                                     
Out[2]: {'5A': 90, '02': 2, '30': 48}

For the second bonus, I decided to treat each byte as an ordinal value and print it as if encoded as UTF-8.

Python3 solution

#!/usr/bin/env python3

"""Compress/decompress text according to a simple prefix code map.

When compressing, data is written as hexadecimal to stdout with a prefix of 'D'.
When decompressing, data is written to stdout.
You can see additional info (printed to stderr) with the -v/--verbose option.
"""

import sys
import math

from itertools import chain, islice

# compression prefix code mapping
prefix_code = {
    ' ': '11',
    'e': '101',
    't': '1001',
    'o': '10001',
    'n': '10000',
    'a': '011',
    's': '0101',
    'i': '01001',
    'r': '01000',
    'h': '0011',
    'd': '00101',
    'l': '001001',
    '~': '001000',
    'u': '00011',
    'c': '000101',
    'f': '000100',
    'm': '000011',
    'p': '0000101',
    'g': '0000100',
    'w': '0000011',
    'b': '0000010',
    'y': '0000001',
    'v': '00000001',
    'j': '000000001',
    'k': '0000000001',
    'x': '00000000001',
    'q': '000000000001',
    'z': '000000000000',
}

def batches(iterable, n=2):
    """Batch an iterable into batches of a given size
    
    [list(b) for b in batches(range(8), n=2)] -> [
        [0, 1],
        [2, 3],
        [4, 5],
        [6, 7]
    ]
    
    [list(b) for b in batches(range(8), n=3)] -> [
        [0, 1, 2],
        [3, 4, 5],
        [6, 7]
    ]
    """
    items = iter(iterable)
    while True:
        batch = islice(items, n)
        try:
            yield chain([next(batch)], batch)
        except StopIteration:
            break

def encode(string):
    """Generate codes for each character in the given string"""
    for c in string:
        if c not in prefix_code:
            raise ValueError(f'unexpected character {c!r}')
        yield prefix_code.get(c)

def compress(codes):
    for byte in batches(chain(*codes), n=8):
        yield ''.join(byte)

def decompress(bytes_):
    """Given some bytes, generate characters from the original string"""
    codebook = {code: c for (c, code) in prefix_code.items()}
    code = ''
    for bit in chain(*bytes_):
        code += bit
        c = codebook.get(code)
        if c is not None:
            yield c
            code = ''

def main(data, verbose=False):
    """Compress a string
    (unless it begins with 'D', in which case, decompress it)
    """
    # decompress/decode
    if data.startswith('D'):
        data = islice(data, 1, None)
        ord_bytes = []
        bin_bytes = []
        hex_bytes = []
        for byte in batches(data, n=2):
            n = int(''.join(byte), 16)
            ord_bytes.append(n)
            hex_bytes.append(f'{n:02x}')
            bin_bytes.append(f'{n:0>8b}')
        if verbose:
            print('hex bytes:', *hex_bytes, file=sys.stderr)
            print('bin bytes:', *bin_bytes, file=sys.stderr)
        print(*decompress(bin_bytes), sep='')
    # compress/encode
    else:
        bin_bytes = []
        hex_bytes = []
        chr_bytes = []
        for byte in compress(encode(data)):
            n = int(f'0b{byte:0<8}', 2)
            bin_bytes.append(f'{n:0>8b}')
            hex_bytes.append(f'{n:02x}')
            chr_bytes.append(chr(n))
        if verbose:
            print('bin bytes:', *bin_bytes, sep=' ', file=sys.stderr)
            print('hex bytes:', *hex_bytes, sep=' ', file=sys.stderr)
            print('unicode bytes: ', *chr_bytes, sep='', file=sys.stderr)
            ratio = math.nan if len(data) == 0 else len(bin_bytes) / len(data)
            print('compression ratio:', ratio, file=sys.stderr)
        print('D', *hex_bytes, sep='', end='')

if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=__doc__
    )
    parser.add_argument(
        'input',
        type=argparse.FileType('r'),
        help='path to input file (use "-" to read input from stdin)'
    )
    parser.add_argument(
        '-v', '--verbose',
        action='store_true',
        help='print extra info about the compression to stderr'
    )
    args = parser.parse_args()
    try:
        main(
            args.input.read().rstrip('\n'),
            verbose=args.verbose
        )
    except ValueError as e:
        print(e.__class__.__name__, e, sep=': ', file=sys.stderr)
    if args.input.name != '<stdin>':
        args.input.close()

./codec.py -h
usage: codec.py [-h] [-v] input

Compress/decompress text according to a simple prefix code map. When
compressing, data is written as hexadecimal to stdout with a prefix of 'D'.
When decompressing, data is written to stdout. You can see additional info
(printed to stderr) with the -v/--verbose option.

positional arguments:
  input          path to input file (use "-" to read input from stdin)

optional arguments:
  -h, --help     show this help message and exit
  -v, --verbose  print extra info about the compression to stderr (default:
                 False)
$ echo 'tildes ~comp' | ./codec.py - -v > tildes.txt
bin bytes: 10010100 10010010 01011010 10111001 00000010 11000100 00110000 10100000
hex bytes: 94 92 5a b9 02 c4 30 a0
unicode bytes: Z¹Ä0 
compression ratio: 0.6666666666666666
$ ./codec.py tildes.txt -v
hex bytes: 94 92 5a b9 02 c4 30 a0
bin bytes: 10010100 10010010 01011010 10111001 00000010 11000100 00110000 10100000
tildes ~comp
$ echo 'this codec compress spaces quite well                                            ' | ./codec.py - -v | ./codec.py - -v
bin bytes: 10010011 01001010 11100010 11000100 10110100 01011100 01011000 10000110 00010101 00010101 01010111 01010000 10101100 01011010 10111000 00000000 10001101 00110011 01110000 01110100 10010010 01111111 11111111 11111111 11111111 11111111 11111111 11111111 11111111 11111111 11111111 11111111 11000000
hex bytes: 93 4a e2 c4 b4 5c 58 86 15 15 57 50 ac 5a b8 00 8d 33 70 74 92 7f ff ff ff ff ff ff ff ff ff ff c0
unicode bytes: JâÄ´\XWP¬Z¸3ptÿÿÿÿÿÿÿÿÿÿÀ
compression ratio: 0.4074074074074074
hex bytes: 93 4a e2 c4 b4 5c 58 86 15 15 57 50 ac 5a b8 00 8d 33 70 74 92 7f ff ff ff ff ff ff ff ff ff ff c0
bin bytes: 10010011 01001010 11100010 11000100 10110100 01011100 01011000 10000110 00010101 00010101 01010111 01010000 10101100 01011010 10111000 00000000 10001101 00110011 01110000 01110100 10010010 01111111 11111111 11111111 11111111 11111111 11111111 11111111 11111111 11111111 11111111 11111111 11000000
this codec compress spaces quite well

Edit: The unicode thing is probably really bad if you end up printing control characters that have particular semantics depending on your terminal. (I should probably remove that…)

3 votes

[2]
gpl (OP)
May 24, 2019
Link Parent
Good catch here, I wrote up the bonuses a little quickly so that I could post before the work day started. I'll fix this now. And nice solution btw :)

The encoded bytes are not guaranteed to be valid ASCII… E.g., in your example, there are only 3 bytes of 8 that are in the ordinal range [0..128):

Good catch here, I wrote up the bonuses a little quickly so that I could post before the work day started. I'll fix this now. And nice solution btw :)

2 votes
1. onyxleopard
  May 24, 2019
  Link Parent
  Thanks. I always like to try doing a problem myself before posting it—even things that seem simple can be harder than you think if you’ve never done them before. Text-encoding is always...
  
  And nice solution btw :)
  
  Thanks. I always like to try doing a problem myself before posting it—even things that seem simple can be harder than you think if you’ve never done them before. Text-encoding is always harder/hairier than it should be.
  
  2 votes

[2]

Soptik

May 25, 2019 (edited May 25, 2019)

Link

That's really nice, thanks for this challenge. I started learning Rust, so this is great opportunity to try it out! use std::io::Read; use std::collections::HashMap; use std::io; fn main() { let...

That's really nice, thanks for this challenge. I started learning Rust, so this is great opportunity to try it out!

use std::io::Read;
use std::collections::HashMap;
use std::io;

fn main() {
    let mut stdin = io::stdin();

    let mut letters_encoding: HashMap<char, &str> = HashMap::with_capacity(28);
    if let Err(()) = fill_letters_map(&mut letters_encoding) {
        eprintln!("Failed to create letter map.");
        return;
    }

    let mut message: String;
    let mut message_buffer: Vec<u8> = Vec::new();
    if let Ok(_) = stdin.read_to_end(&mut message_buffer) {
        message = String::from_utf8(message_buffer).unwrap();
        message = message.trim().to_ascii_lowercase();
        let encoded_in_string = create_bytes(&message, &letters_encoding);
        println!("{}", encoded_in_string.join(" "));
        let byte_numbers = convert_bytes_to_u8(&encoded_in_string);
        let hex_bytes: Vec<String> = byte_numbers.iter().map(|num| format!("{:02X}", num)).collect();
        let utf8_string = String::from_utf8_lossy(&byte_numbers);
        println!("{}", hex_bytes.join(" "));
        println!("{}", utf8_string);
        let saved_bytes = message.len() - encoded_in_string.len();
        println!(
            "Saved {} bytes ({}%)",
            saved_bytes,
            (encoded_in_string.len() as f32) * 100. / (message.len() as f32)
        );
    } else {
        eprintln!("Failed to get message from stdin.");
    }
}

/// Fill letters encoding/decoding map
fn fill_letters_map(letters_encoding: &mut HashMap<char, &str>) -> Result<(), ()> {
    let keys: Vec<(usize, char)> = " tnsrd~cmgbvkqeoaihlufpwyjxz".char_indices().collect();
    let values = "11,1001,10000,0101,01000,00101,001000,000101,000011,0000100,0000010,00000001,0000000001,000000000001,101,10001,011,01001,0011,001001,00011,000100,0000101,0000011,0000001,000000001,00000000001,000000000000";
    let values: Vec<&str> = values.split(',').collect();
    if keys.len() != values.len() {
        return Err(());
    }
    let mut idx: usize = 0;
    for (_, key) in keys {
        letters_encoding.insert(key, values[idx]);
        idx += 1;
    }
    Ok(())
}

/// Convert message to bytes according to encoding/decoding map and split it into
/// bytes (in string).
fn create_bytes<'a>(message: &'a str, letters_encoding: &HashMap<char, &'a str>) -> Vec<String> {
    let mut result = String::new();
    for chr in message.chars() {
        if !letters_encoding.contains_key(&chr) {
            eprintln!("Invalid character, skipping: {}", &chr);
        } else {
            result += &letters_encoding[&chr];
        }
    }

    let number_of_batches = result.len() / 8 + if result.len() % 8 > 0 { 1 } else { 0 };

    let mut result_splitted: Vec<String> = Vec::with_capacity(number_of_batches);
    for i in 0..number_of_batches {
        let letters_in_this_batch = std::cmp::min(result.len() - i * 8, 8);
        let zeros_to_fill = 8 - letters_in_this_batch;
        let max_index = std::cmp::min(i * 8 + 8, result.len());
        let st = "0".repeat(zeros_to_fill) + &result[i * 8..max_index];
        result_splitted.push(st);
    }

    result_splitted
}

/// Take vector of bytes and return vector of numbers (u8)
fn convert_bytes_to_u8(bytes: &Vec<String>) -> Vec<u8> {
    let mut result: Vec<u8> = Vec::with_capacity(bytes.len());

    for byte in bytes {
        if let Ok(hex) = bin_string_to_u8(&byte) {
            result.push(hex);
        } else {
            eprintln!("Failed to convert to hex: {}", byte);
            result.push(0);
        }
    }

    result
}

/// Convert binary string to u8. Example: 101 => 5
fn bin_string_to_u8(bin: &str) -> Result<u8, ()> {
    let mut result: u8 = 0;
    let mut power: u32 = 0;
    for (_, letter) in bin.char_indices().rev() {
        match letter {
            '0' => {}
            '1' => {
                result += (2 as u8).pow(power);
            }
            _ => {
                return Err(());
            }
        }
        power += 1;
    }

    Ok(result)
}

I still don't have the decoding bonus, but maybe I'll finish it later.

Out of curiosity, I piped The Sorrows of Young Werther (without unsupported characters, which took it from 251K to 239K) into this encoder and it saved 115577 bytes (52.707966%). I'm surprised how good this encoding is, @gpl!

3 votes

onyxleopard

May 25, 2019

Link Parent

This encoding is taking into account relative frequency of the English alphabet in English (e.g. common words like ’the’ and 'a' get compressed rather well—uncommon words like 'syzygy' don’t. $...

This encoding is taking into account relative frequency of the English alphabet in English (e.g. common words like ’the’ and 'a' get compressed rather well—uncommon words like 'syzygy' don’t.

$ echo 'the' | ./codec.py - -v
bin bytes: 10010011 10100000
hex bytes: 93 a0
unicode bytes:  
compression ratio: 0.6666666666666666
D93a0
$ echo 'syzygy' | ./codec.py - -v
bin bytes: 01010000 00100000 00000000 00000100 00100000 00010000
hex bytes: 50 20 00 04 20 10
unicode bytes: P  
compression ratio: 1.0
D502000042010
$ echo 'if you use common words its good' | ./codec.py - -v
bin bytes: 01001000 10011000 00011000 10001111 00011010 11011100 01011000 10000110 00011100 01100001 10000011 10001010 00001010 10111010 01100101 01110000 10010001 10001001 01000000
hex bytes: 48 98 18 8f 1a dc 58 86 1c 61 83 8a 0a ba 65 70 91 89 40
unicode bytes: HÜXa
ºep@
compression ratio: 0.59375
D4898188f1adc58861c61838a0aba6570918940
$ echo 'hapax legomenon syzygy qwerty' | ./codec.py - -v
bin bytes: 00110110 00010101 10000000 00011100 10011010 00010010 00100001 11011000 01000110 00011010 10000001 00000000 00000000 00100001 00000000 11100000 00000010 00001110 10100010 01000000 10000000
hex bytes: 36 15 80 1c 9a 12 21 d8 46 1a 81 00 00 21 00 e0 02 0e a2 40 80
unicode bytes: 6!ØF!à¢@
compression ratio: 0.7241379310344828
D3615801c9a1221d8461a8100002100e0020ea24080

Also, if I’m reading your code correctly (I’m not fluent in Rust), it looks like in your create_bytes function you’re throwing away invalid characters. Lossy compression is fine and all, but for text, it’s probably not going to be excusable if you decompress a novel and all the punctuation is missing ;P.

2 votes

Hvv

May 25, 2019

Link

This program uses the default output format in C++. This has the side effect that the program will happily print out complete garbage every time it encodes a line. I did something special-ish for...

This program uses the default output format in C++. This has the side effect that the program will happily print out complete garbage every time it encodes a line.

I did something special-ish for decoding by using a trie to deduce letters from the input, but other than that (and abusing the char to int correspondence), it's mostly normal stuff.

Does 3(?)/3 bonuses

C++

#include <bits/stdc++.h>
using namespace std;
unordered_map<char,string> mp;
int ctr = 1;
int tr[100][2];
char cha[100];
bool oc[100];
string res;
const char* init[26] = {
	"011","0000010","000101","00101","101",
	"000100","0000100","0011","01001","000000001",
	"0000000001","001001","000011","10000","10001",
	"0000101","000000000001","01000","0101","1001",
	"00011","00000001","0000011","00000000001","0000001",
	"000000000000"};
void insert(int ch, string val) {
	mp.insert({char(ch),val});
	int st = 0;
	int pos = 0;
	while(pos < val.size()) {
		bool id = val[pos] == '1';
		if(tr[st][id] == -1) {
			tr[st][id] = ctr++;
		}
		st = tr[st][id];
		pos++;
	}
	oc[st] = true;
	cha[st] = ch;
}
string bth(string s) {
	string res(s.size()/4,0);
	for(int i=0;i<s.size()/4;i++) {
		int hex = 0;
		for(int j=0;j<4;j++) {
			hex *= 2;
			hex += s[i*4+j]-'0';
		}
		res[i] = (hex>=10?(hex-10)+'A':hex+'0');
	}
	return res;
}
string dec(string s) {
	vector<int> dc(s.size()*8-8,0);
	for(int i=1;i<s.size();i++) {
		int c = s[i];
		if(c < 0) {c += 256;}
		for(int j=0;j<8;j++) {
			dc[i*8-1-j] = c%2;
			c /= 2;
		}
	}
	int st = 0;
	int pos = 0;
	string res;
	while(pos < dc.size()) {
		int id = dc[pos];
		st = tr[st][id];
		if(oc[st]) {
			res.push_back(cha[st]);
			st = 0;
		}
		pos++;
	}
	return res;
}
string enc(string s) {
	string res;
	for(int i=0;i<s.size();i++) {
		if(mp.find(s[i]) != mp.end()) res.append(mp[s[i]]);
		else return "-";
	}
	int pad = (8-res.size()%8)%8;
	res.append(pad,'0');
	return res;
}
string asc(string s) {
	string res(s.size()/8,'\0');
	for(int i=0;i<s.size()/8;i++) {
		int ch = 0;
		for(int j=0;j<8;j++) {
			ch *= 2;
			ch += s[i*8+j]-'0';
		}
		res[i] = ch;
	}
	return res;
}
string pad(string s, int n) {
	string res(s.size()+(s.size()/n-1),'\0');
	int pos = 0;
	for(int i=0;i<s.size();i++) {
		res[pos++] = s[i];
		if(i > 0 && !((i+1)%n)) {res[pos++] = ' ';}
	}
	return res;
}
int main() {
	memset(tr,-1,sizeof(tr));
	memset(cha,-1,sizeof(cha));
	memset(oc,0,sizeof(oc));
	for(int i=0;i<26;i++) {
		insert(i+'a',init[i]);
	}
	insert('~',"001000");
	insert(' ',"11");
	while(1) {
		string in;
		getline(cin,in);
		if(in.size() == 0) break;
		if(in[0] == 'D') {
			cout << dec(in) << '\n';
		} else {
			string res = enc(in);
			if(res[0] == '-') {
				cout << "Invalid input\n";continue;
			}
			cout << pad(res,8) << '\n';
			cout << pad(bth(res),2) << '\n';
			cout << asc(res) << '\n';
			double ratio = double(res.size())/(in.size()*8);
			cout << "Compression ratio " << ratio << '\n';
		}
	}
}

2 votes