#include <xmtc.h>
#include <xmtio.h>

/*
 * Soft assertion: prints a diagnostic (and keeps running) when `cond` is
 * false.
 *
 * The condition is parenthesised before negation so that an argument such as
 * `x == 5` is tested as `!(x == 5)` rather than `(!x) == 5`, and the body is
 * wrapped in do { } while (0) so the macro behaves as a single statement
 * (safe inside an unbraced if/else).
 */
#define ASSERT(cond) do { if (!(cond)) printf("ASSERTION FAILED: %s\n", #cond); } while (0)

int curtime = 0;

//#define DEBUG 1

#ifdef __XMTC_2_OPENMP__
/*
 * Debug helper: write the `len` words of `arr` to the text file `path`,
 * one value per line as 8-digit lower-case hexadecimal.
 *
 * If the file cannot be opened, an error is reported on stderr and nothing
 * is written (the original dereferenced the NULL stream).
 */
void dump_(int *arr, int len, const char *path) {
	FILE *f = fopen(path, "w");
	if (f == NULL) {
		perror(path);
		return;
	}
	for (int i = 0; i < len; i++)
		fprintf(f, "%.8x\n", (unsigned int)arr[i]);
	fclose(f);
}
/*
 * Debug helper: write the `len` words of `arr` to `path` as raw binary
 * (host byte order, sizeof(int) bytes per element).
 *
 * The file is opened in binary mode so no newline translation can corrupt
 * the data, and a failed open is reported on stderr instead of crashing.
 */
void dumpb_(int *arr, int len, const char *path) {
	FILE *f = fopen(path, "wb");
	if (f == NULL) {
		perror(path);
		return;
	}
	if (len > 0)
		fwrite(arr, sizeof(int), (size_t)len, f);
	fclose(f);
}
// dump()/dumpb() derive the element count from sizeof(arr), so `arr` must be
// a real array (fixed-size or VLA) visible at the call site, not a pointer.
#define dump(arr, path) dump_(arr, sizeof(arr)/sizeof(int), path)
#define dumpb(arr, path) dumpb_(arr, sizeof(arr)/sizeof(int), path)
#else
// When __XMTC_2_OPENMP__ is not defined the dump helpers do not exist, so
// both macros expand to nothing and the calls compile away.
#define dump(arr, path)
#define dumpb(arr, path)
#endif

#define USE_KSPAWN
#include "kspawn.h"
#include "blocksort.h"
#include "profile.h"
#include "lib/arith.h"
#include "lib/sum.h"
#include "lib/kheap.h"
#include "lib/bytestream.h"
#include "lib/bstream.h"
#include "lib/listrank.h"
#include "lib/fi.h"
#include "lib/shift.h"

#ifndef UCHAR_MAX
#define UCHAR_MAX 255
#endif

// Number of distinct byte values.
#define ALPHABET_SIZE (UCHAR_MAX + 1)
// Number of MTF symbols covered by one selector (used to compute the
// selector count written to the block header).
#define BYTES_PER_SELECTOR 50

// SWAP3 rotates through a caller-supplied temporary `w`; SWAP declares the
// temporary itself using the GNU `typeof` extension.
#define SWAP3(w, x, y) do { w = x; x = y; y = w; } while (0)
#define SWAP(x, y) do { typeof(x) temp; SWAP3(temp, x, y); } while (0)
// WRAP folds an index in [0, 2n) back into [0, n); WRAPR folds an index in
// [-n, n) back into [0, n).  Both evaluate `i` more than once.
#define WRAP(i, n) ((i) >= (n) ? (i) - (n) : (i))
#define WRAPR(i, n) ((i) < 0 ? (i) + (n) : (i))

//#define INPUT "banana"
//#define INPUT "abraca"
//#define N (sizeof(INPUT) - 1)
//const char *input = INPUT;

#define INPUT_PTR ((const char *)input)

// A Huffman tree node packs two 10-bit child fields into one int:
//   bits 10..19 : child 0      bits 0..9 : child 1
// Within each 10-bit field, bit 9 (NODE_T_BIT) is the "T" flag and the low
// 9 bits are the payload "C".  As used in this file, T == 1 means C is the
// index of another tree node and T == 0 means C is a leaf symbol.
typedef int huffman_node;
#define NODE_T_BIT 0x200
// Build a node from two already-formed 10-bit child fields.
#define MK_NODE(i0, i1) ((i0) << 10 | (i1))
// Child 0 accessors: T flag, 9-bit payload, whole 10-bit field.
#define NODE_T0(x) ((x) >> 19 & 0x1  )
#define NODE_C0(x) ((x) >> 10 & 0x1ff)
#define NODE_0(x)  ((x) >> 10 & 0x3ff)
// Child 1 accessors: T flag, 9-bit payload, whole 10-bit field.
#define NODE_T1(x) ((x) >>  9 & 0x1  )
#define NODE_C1(x) ((x)       & 0x1ff)
#define NODE_1(x)  ((x)       & 0x3ff)
// NODE_CHANGE yields a copy of x with child (i & 1) replaced by field y;
// NODE_SET stores that back into x.  NODE_SET already ends in ';', so a
// call written `NODE_SET(...);` produces an extra empty statement.
#define NODE_CHANGE(x, i, y) ((i) & 1 ? MK_NODE(NODE_0(x), y) : MK_NODE(y, NODE_1(x)))
#define NODE_SET(x, i, y) x = NODE_CHANGE(x, i, y);

// Worst-case growth factor of the run-length encoding.  Deliberately left
// unparenthesised: `N * RLE_EXPAND` expands to `N * 5/4`, i.e. (N * 5) / 4.
// Writing it as (5/4) would truncate to 1.  It must therefore always appear
// as the right operand of a multiplication.
// NOTE(review): N is not defined in this part of the file; presumably it
// (and `input`) come from one of the included headers — confirm.
#define RLE_EXPAND 5/4
// Compression-side buffers (one int per symbol).
int bwt[N * RLE_EXPAND];
int mtf_stream[N * RLE_EXPAND + 1];
// Bit stream holding the file/block headers followed by the Huffman codes,
// and its length in bits.
int huffman_stream[2*N + 100];
int huffman_stream_len;

// Decompression-side buffers: outputs of Huffman decode, MTF decode and the
// inverse BWT respectively.
int ihuf_stream[N * RLE_EXPAND];
int imtf_stream[N * RLE_EXPAND];
int ibwt_stream[N * RLE_EXPAND];
int ibwt_stream_pack[CDIV(N * RLE_EXPAND, 4)];
// Final decompressed bytes, packed four per word.
int round_trip[CDIV(N, 4)];

#define K 20

/*
 * Parallel fill: store `c` into every element of `s` visited by the kspawn
 * loop and return `s`.
 *
 * NOTE(review): begin_kspawn(0, n, K) comes from kspawn.h; it presumably runs
 * the body once for every index $$ in [0, n), K indices per thread — confirm
 * the exact bounds there.
 */
int *memset_int(int *s, int c, int n) {
	begin_kspawn(0, n, K) {
		s[$$] = c;
	} end_kspawn
	return s;
}

#ifdef __XMTC_2_OPENMP__
#include <string.h>
/*
 * Print at most the first 70 characters of `s` on one line.  Newlines and
 * tabs are shown as reverse-video "\n" / "\t" markers so the output never
 * spans more than one terminal line.
 */
void print_string(const char *s) {
	/* Each input character expands to at most 10 output bytes. */
	char line[701];
	char *out = line;
	const char *in;

	for (in = s; in - s < 70 && *in != '\0'; in++) {
		const char *escaped = NULL;

		if (*in == '\n')
			escaped = "\x1b[7m\\n\x1b[0m";
		else if (*in == '\t')
			escaped = "\x1b[7m\\t\x1b[0m";

		if (escaped == NULL) {
			*out++ = *in;
			continue;
		}

		/* Both escape sequences are exactly 10 bytes long. */
		strcpy(out, escaped);
		out += 10;
	}
	*out = '\0';
	printf("%s\n", line);
}
#endif

/*
 * Return byte `i` of `word`: the word is shifted by i * 8 bits via SRLV and
 * the low 8 bits of the result are returned.
 *
 * NOTE(review): SRLV is defined outside this file; it presumably performs a
 * logical right shift by a variable amount (dest, src, amount) — confirm.
 */
int __inline__ get_word_byte(int word, int i) {
	SRLV(word, word, i * 8);
	return word & 0xff;
}

// Assemble a word from four variables named <v>0..<v>3 (token pasting), with
// <v>0 as the least significant byte.  The expansion is fully parenthesised
// so it can be embedded safely in a larger expression.
#define BYTES_TO_WORD(v) ((v##3) << 24 | (v##2) << 16 | (v##1) << 8 | (v##0))

// Round n down / up to a multiple of `block`; `block` must be a power of two.
#define ALIGN_DOWN(n, block) ((n) & ~((block) - 1))
#define ALIGN_UP(n, block) ALIGN_DOWN((n) + (block) - 1, block)
// Mask of `length` consecutive one bits starting at bit `begin`
// (length must be smaller than the width of int).
#define BITMASK(begin, length) (((1 << (length)) - 1) << (begin))

#define RLE_CHUNKSIZE 256

/*
 * Parallel run-length encoder.
 *
 * Reads `n` bytes from `input` through the bytestream reader (the byte
 * preceding a chunk is fetched with get_word_byte(input[begin / 4 - 1], 3),
 * i.e. the input is packed four bytes per word) and writes the encoded bytes
 * to `output` through the bytestream writer.
 *
 * Encoding, as implemented below: the first four bytes of every run are
 * copied literally; once a run has reached length >= 4 a count byte holding
 * (run length - 4) is emitted when the run ends, when it reaches 255 bytes
 * (after which counting restarts), or at the end of the input.
 *
 * The input is split into chunks of RLE_CHUNKSIZE bytes, one thread each,
 * and processed in three passes separated by prefix sums:
 *   1. runpos: position within its run of the last byte of every chunk;
 *   2. outpos: number of output bytes each chunk produces;
 *   3. the actual output, each chunk writing at its own offset.
 *
 * Returns the total number of output bytes.
 */
int rle_encode(int *input, int *output, int n) {
	STARTTIME();

	int n_chunks = CDIV(n, RLE_CHUNKSIZE);

	// Find the position of each byte in its run
	int runpos[n_chunks + 1];
	spawn(0, n_chunks - 1) {
		// $ is the chunk index; [begin, end) is this chunk's byte range.
		int begin = $ * RLE_CHUNKSIZE;
		int end = begin + RLE_CHUNKSIZE;
		if (end > n)
			end = n;
		int end_align = ALIGN_DOWN(end, 4);

		bytestream_t ins;
		BYTE_STREAM_INIT_READ(ins, input, begin);

		// b0 is the last byte of the previous chunk, or -1 (matches no byte)
		// for the first chunk.
		int b0 = (begin > 0) ? get_word_byte(input[begin / 4 - 1], 3) : -1;
		// NOTE(review): FI() comes from lib/fi.h; presumably it builds a
		// "flagged int" consumed by prefix_sum_flagged_int below — confirm.
		int running_total = FI(0, 1);
		int i;
		// Four bytes at a time: the latest position where two neighbours
		// differ decides how long the run ending at b4 is.
		for (i = begin; i < end_align; i += 4) {
			int b1, b2, b3, b4;
			BYTE_STREAM_READ4(ins, b1, b2, b3, b4);

			if (b4 != b3)
				running_total = 1;
			else if (b3 != b2)
				running_total = 2;
			else if (b2 != b1)
				running_total = 3;
			else if (b1 != b0)
				running_total = 4;
			else
				running_total += 4;

			b0 = b4;
		}

		// Remaining (at most three) bytes of the chunk, one at a time.
		for ( ; i < end; i++) {
			int b1;
			BYTE_STREAM_READ(ins, b1);

			if (b0 == b1)
				running_total++;
			else
				running_total = 1;

			b0 = b1;
		}

		runpos[$ + 1] = running_total;
	}
	SHOWTIME("local_count");

	runpos[0] = 1;
	// NOTE(review): presumably carries run lengths across chunk boundaries
	// for chunks that consist of a single continued run — confirm in
	// lib/sum.h.
	prefix_sum_flagged_int(runpos + 1, runpos + 1, n_chunks);
	SHOWTIME("global_count");

	// Reserve a space for each byte (up to four per run)
	// and for each count (one per run of >= 4) in the output
	int outpos[n_chunks + 1];
	spawn(0, n_chunks - 1) {
		int begin = $ * RLE_CHUNKSIZE;
		int end = begin + RLE_CHUNKSIZE;
		if (end > n)
			end = n;

		bytestream_t ins;
		BYTE_STREAM_INIT_READ(ins, input, begin);

		int b0 = (begin > 0) ? get_word_byte(input[begin / 4 - 1], 3) : -1;
		// Run position carried in from the previous chunk, folded into the
		// range 1..255 because runs are restarted after 255 bytes.
		int running_total = (runpos[$] + 255 - 1) % 255 + 1;
		int block_out = 0;
		int i;
		for (i = begin; i < end; i++) {
			int b1;
			BYTE_STREAM_READ(ins, b1);

			if (b0 == b1) {
				if (running_total == 255) {
					// Run is full: one count byte, then start over.
					block_out++;
					running_total = 0;
				}
				running_total++;
				// Only the first four bytes of a run are emitted literally.
				if (running_total <= 4)
					block_out++;
			}
			else {
				// Previous run ended; it needs a count byte if it was >= 4.
				if (running_total >= 4)
					block_out++;
				running_total = 1;
				block_out++;
			}

			b0 = b1;
		}
		// The very last run of the input also needs its count byte.
		if (end == n && running_total >= 4)
			block_out++;

		outpos[$ + 1] = block_out;
	}
	SHOWTIME("local_locate");

	outpos[0] = 0;
	prefix_sum_int(outpos + 1, outpos + 1, n_chunks);
	SHOWTIME("global_locate");

	// Write counts and bytes to output
	// (same state machine as the counting pass above, but emitting bytes)
	spawn(0, n_chunks - 1) {
		int begin = $ * RLE_CHUNKSIZE;
		int end = begin + RLE_CHUNKSIZE;
		if (end > n)
			end = n;

		bytestream_t ins, outs;
		BYTE_STREAM_INIT_READ(ins, input, begin);
		BYTE_STREAM_INIT_WRITE(outs, output, outpos[$]);

		int b0 = (begin > 0) ? get_word_byte(input[begin / 4 - 1], 3) : -1;
		int running_total = (runpos[$] + 255 - 1) % 255 + 1;
		int i;
		for (i = begin; i < end; i++) {
			int b1;
			BYTE_STREAM_READ(ins, b1);

			if (b0 == b1) {
				if (running_total == 255) {
					BYTE_STREAM_WRITE(outs, running_total - 4);
					running_total = 0;
				}
				running_total++;
				if (running_total <= 4)
					BYTE_STREAM_WRITE(outs, b1);
			}
			else {
				if (running_total >= 4)
					BYTE_STREAM_WRITE(outs, running_total - 4);
				running_total = 1;
				BYTE_STREAM_WRITE(outs, b1);
			}

			b0 = b1;
		}
		if (end == n && running_total >= 4)
			BYTE_STREAM_WRITE(outs, running_total - 4);
		BYTE_STREAM_FLUSH(outs);
	}
	SHOWTIME("output");

	#if DEBUG >= 1
	for (int i = 0; i <= n_chunks; i++)
		printf("runpos[%d] == %d\n", i, runpos[i]);
	for (int i = 0; i <= n_chunks; i++)
		printf("outpos[%d] == %d\n", i, outpos[i]);
	#endif

	return outpos[n_chunks];
}

/*
 * Parallel run-length decoder (inverse of the encoding produced by
 * rle_encode).
 *
 * `input` holds `n` encoded symbols, one per int; the decoded bytes are
 * written to `output` through the bytestream writer.  After four equal
 * literal symbols the next symbol is a count: that many extra copies of the
 * run byte are produced.
 *
 * Like the encoder, the work is split into chunks of RLE_CHUNKSIZE symbols
 * and done in three passes (run positions, output sizes, output) separated
 * by prefix sums.
 *
 * Returns the total number of decoded bytes.
 */
int rle_decode(int *input, int *output, int n) {
	STARTTIME();

	int n_chunks = CDIV(n, RLE_CHUNKSIZE);

	// Find the position of each byte in its run.
	// This is necessary to handle the case where there is a long run of
	// (char)251 in the input, which is encoded as 4 instances of (char)251
	// followed by another (char)251 for the run length, etc.
	int runpos[n_chunks + 1];
	spawn(0, n_chunks - 1) {
		int begin = $ * RLE_CHUNKSIZE;
		int end = begin + RLE_CHUNKSIZE;
		if (end > n)
			end = n;
		// NOTE(review): end_align is computed but never used in this function.
		int end_align = ALIGN_DOWN(end, 4);

		// Last symbol of the previous chunk, or -1 for the first chunk.
		int b0 = (begin > 0) ? input[begin - 1] : -1;
		// NOTE(review): FI() comes from lib/fi.h; presumably a "flagged int"
		// for prefix_sum_flagged_int — confirm.
		int running_total = FI(0, 1);
		int i;

		for (i = begin; i < end; i++) {
			int b1 = input[i];

			if (b0 == b1)
				running_total++;
			else
				running_total = 1;

			b0 = b1;
		}

		runpos[$ + 1] = running_total;
	}
	SHOWTIME("local_count");

	runpos[0] = 1;
	prefix_sum_flagged_int(runpos + 1, runpos + 1, n_chunks);
	SHOWTIME("global_count");

	// Reserve a space for each regular character and x spaces for each
	// run-length character with value x
	int outpos[n_chunks + 1];
	spawn(0, n_chunks - 1) {
		int begin = $ * RLE_CHUNKSIZE;
		int end = begin + RLE_CHUNKSIZE;
		if (end > n)
			end = n;

		int b0 = (begin > 0) ? input[begin - 1] : -1;
		// Decoder state at the chunk start: four literals plus one count
		// form a group of five, so the raw run position is reduced mod 5.
		// NOTE(review): this assumes the raw-equality run position recovers
		// the decoder state; confirm it holds when a count byte is directly
		// followed by literals equal to that count value.
		int running_total = runpos[$] % 5;
		int block_out = 0;
		int i;
		for (i = begin; i < end; i++) {
			int b1 = input[i];

			if (running_total == 4) {
				// b1 is a count: it expands to b1 output bytes.
				block_out += b1;
				running_total = 0;
				b0 = -1;
			}
			else {
				block_out++;
				if (b0 == b1) {
					running_total++;
				}
				else {
					b0 = b1;
					running_total = 1;
				}
			}
		}

		outpos[$ + 1] = block_out;
	}
	SHOWTIME("local_locate");

	outpos[0] = 0;
	prefix_sum_int(outpos + 1, outpos + 1, n_chunks);
	SHOWTIME("global_locate");

	// Write output
	// (same state machine as the sizing pass above, but emitting bytes)
	spawn(0, n_chunks - 1) {
		int begin = $ * RLE_CHUNKSIZE;
		int end = begin + RLE_CHUNKSIZE;
		if (end > n)
			end = n;

		bytestream_t outs;
		BYTE_STREAM_INIT_WRITE(outs, output, outpos[$]);

		int b0 = (begin > 0) ? input[begin - 1] : -1;
		int running_total = runpos[$] % 5;
		int i;
		for (i = begin; i < end; i++) {
			int b1 = input[i];

			if (running_total == 4) {
				// Count symbol: repeat the run byte b1 more times.
				int j;
				for (j = 0; j < b1; j++)
					BYTE_STREAM_WRITE(outs, b0);
				running_total = 0;
				b0 = -1;
			}
			else {
				BYTE_STREAM_WRITE(outs, b1);
				if (b0 == b1) {
					running_total++;
				}
				else {
					b0 = b1;
					running_total = 1;
				}
			}
		}

		BYTE_STREAM_FLUSH(outs);
	}
	SHOWTIME("output");

	return outpos[n_chunks];
}

#define PCLUS 32
#define WORD_AT_BYTE_OFFSET(a, i) (*(int *)((char *)a + i))
/*
 * Pack `n` byte values stored one per int in `input` into `output`, four
 * bytes per word with the first byte in the least significant position.
 * Each thread handles PCLUS consecutive input bytes.  A final partial word
 * (1-3 bytes) has its unused high bytes left as zero.
 */
void dopack(int *input, int *output, int n) {
	spawn(0, CDIV(n, PCLUS) - 1) {
		int first = $ * PCLUS;
		int last = first + PCLUS;
		if (last > n)
			last = n;
		int whole_words_end = ALIGN_DOWN(last, 4);

		/* Complete groups of four bytes. */
		int pos = first;
		while (pos < whole_words_end) {
			int word = input[pos + 3] << 24 | input[pos + 2] << 16 | input[pos + 1] << 8 | input[pos];
			WORD_AT_BYTE_OFFSET(output, pos) = word;
			pos += 4;
		}

		/* Trailing 1-3 bytes, if this chunk ends the input mid-word. */
		int leftover = last - whole_words_end;
		if (leftover > 0) {
			int word = 0;
			int k;
			for (k = 0; k < leftover; k++)
				word |= input[pos + k] << (8 * k);
			WORD_AT_BYTE_OFFSET(output, pos) = word;
		}
	}
}

/*
 * Inverse Burrows-Wheeler transform.
 *
 * input  : n transformed symbols, one per int (values index a table of
 *          ALPHABET_SIZE entries).
 * index  : the position recorded by the forward transform (origPtr).
 * output : receives the n reconstructed symbols, one per int.
 *
 * Steps: per-block symbol counting, a prefix sum that turns the counts into
 * starting offsets, construction of a linked list over the positions, list
 * ranking, and a final permutation of the input by rank.
 */
void ibwt(int *input, int index, int *output, int n) {
	STARTTIME();
	// The input is cut into `threads` blocks of block_size symbols.
	int block_size = MAX(n / 1024, 1);
	int threads = CDIV(n, block_size);

	// p[i]: number of earlier occurrences of input[i] inside its own block.
	// c   : one counter per (symbol, block) pair, laid out symbol-major and
	//       shifted by one slot so that the prefix sum below leaves, at
	//       c[symbol * threads + block], the sum of all preceding counters.
	int p[n];
	int c[ALPHABET_SIZE * threads + 1];
	memset_int(c, 0, ALPHABET_SIZE * threads + 1);
	SHOWTIME("init");

	spawn(0, threads - 1) {
		int lb = $ * block_size;
		int ub = lb + block_size;
		int i;
		if (ub > n)
			ub = n;
		for (i = lb; i < ub; i++)
			p[i] = c[input[i] * threads + $ + 1]++;
	}
	SHOWTIME("count");

	// NOTE(review): assumes prefix_sum_int computes an inclusive prefix sum
	// in place — confirm in lib/sum.h.
	prefix_sum_int(c + 1, c + 1, ALPHABET_SIZE * threads);
	SHOWTIME("psum");

	int next[n];
	int rank[n];
	begin_kspawn(0, n, K) {
		// j: position of element $$ among all elements when ordered by
		// symbol, ties broken by original position.
		int j = p[$$] + c[input[$$] * threads + $$ / block_size];
		if (j == index) {
			// This element terminates the list: it points at itself with
			// weight 0.
			next[$$] = $$;
			rank[$$] = 0;
		}
		else {
			next[$$] = j;
			rank[$$] = 1;
		}
	} end_kspawn
	SHOWTIME("setuprank");

	dump(next, "next.txt");
	dump(rank, "rank.txt");
	// NOTE(review): list_rank (lib/listrank.h) presumably replaces rank[i]
	// with the summed weights from i to the end of the list — confirm.
	list_rank(next, rank, n);
	SHOWTIME("rank");

	// Scatter every symbol to the output slot given by its rank.
	begin_kspawn(0, n, K) {
		output[rank[$$]] = input[$$];
	} end_kspawn
	SHOWTIME("permute");
}

int mtf_encode(const int *input, int *output, int n, int *used_chars, int *n_used_chars);
int mtf_decode(const int *input, int *output, int n, int *used_chars);

#define FREQ_COUNT_SPREAD 128
/*
 * Histogram: freq[s] receives the number of occurrences of symbol s among
 * input[0..n), for s in [0, n_charset).
 *
 * Counting uses FREQ_COUNT_SPREAD separate copies of the histogram; element
 * $$ updates copy ($$ % FREQ_COUNT_SPREAD).  The copies are summed at the
 * end.  (Presumably this spreads the psm updates over more memory locations
 * to reduce contention.)
 */
void freq_count(const int *input, int n, int *freq, int n_charset) {
	int freq_spread[n_charset * FREQ_COUNT_SPREAD];

	// Clear all copies.
	begin_kspawn(0, n_charset * FREQ_COUNT_SPREAD, K) {
		freq_spread[$$] = 0;
	} end_kspawn

	begin_kspawn(0, n, K) {
		int i = 1;
		// NOTE(review): psm is the XMTC prefix-sum-to-memory primitive;
		// presumably it atomically adds i to the target — confirm.
		psm(i, freq_spread[input[$$] + $$ % FREQ_COUNT_SPREAD * n_charset]);
	} end_kspawn

	// Reduce the copies: one thread per symbol.
	spawn(0, n_charset - 1) {
		freq[$] = freq_spread[$];
		for (int i = 1; i < FREQ_COUNT_SPREAD; i++)
			freq[$] += freq_spread[$ + i * n_charset];
	}
}

/*
 * Build a Huffman tree for n_freq symbols with the given frequencies.
 *
 * Every symbol is inserted into a heap keyed by frequency; the two entries
 * removed first are then repeatedly merged into a new internal node, which
 * is re-inserted with the summed frequency and tagged with NODE_T_BIT so it
 * can be told apart from a leaf symbol.
 *
 * Nodes are appended to `tree` in creation order, so the node created last
 * (the root) is the final one written.
 *
 * NOTE(review): relies on heap_remove (lib/kheap.h) returning the entry with
 * the smallest frequency and returning 0 when the heap is empty — confirm.
 */
void build_huffman_tree(const int *freq, int n_freq, huffman_node *tree) {
	#ifdef DEBUG
	{
		/* Own scope: keeps this index from clashing with locals below,
		 * which previously made DEBUG builds fail to compile. */
		int d;
		for (d = 0; d < n_freq; d++) {
			if (freq[d] > 0)
				printf("%d:%d ", d, freq[d]);
		}
		printf("\n");
	}
	#endif

	heap_t heap;
	heap.size = 0;

	for (int s = 0; s < n_freq; s++)
		heap_insert(&heap, freq[s], s);

	int i = 0;
	int c1, f1, c2, f2;
	/* Stops when fewer than two entries remain (the last one is the root). */
	while (heap_remove(&heap, &f1, &c1) && heap_remove(&heap, &f2, &c2)) {
		tree[i] = MK_NODE(c1, c2);
		heap_insert(&heap, f1 + f2, NODE_T_BIT | i);
		i++;
	}
}

/*
 * Walk the subtree rooted at tree[i] and record, for every leaf symbol, its
 * depth (= code length) in lengths[symbol].  `curlen` is the depth of node i
 * itself; *minlen / *maxlen are lowered / raised to cover every leaf depth
 * encountered.  Child 0 is always visited before child 1.
 */
void huffman_tree_to_counts_recursive(const huffman_node *tree, int *lengths, int i, int curlen, int *minlen, int *maxlen) {
	const huffman_node node = tree[i];
	const int depth = curlen + 1;
	int side;

	for (side = 0; side < 2; side++) {
		int is_subtree = side ? NODE_T1(node) : NODE_T0(node);
		int payload = side ? NODE_C1(node) : NODE_C0(node);

		if (is_subtree) {
			/* Internal child: payload is the index of another node. */
			huffman_tree_to_counts_recursive(tree, lengths, payload, depth, minlen, maxlen);
			continue;
		}

		/* Leaf child: payload is the symbol. */
		lengths[payload] = depth;
		if (depth < *minlen)
			*minlen = depth;
		if (depth > *maxlen)
			*maxlen = depth;
	}
}

/*
 * Compute Huffman code lengths for n_freq symbols: lengths[s] receives the
 * code length of symbol s, and *minlen / *maxlen the shortest and longest
 * length assigned.
 */
void build_huffman_lengths(const int *freq, int n_freq, int *lengths, int *minlen, int *maxlen) {
	huffman_node nodes[n_freq];
	/* build_huffman_tree appends nodes in creation order; the traversal
	 * starts from index n_freq - 2. */
	const int root = n_freq - 2;

	build_huffman_tree(freq, n_freq, nodes);

	/* Seed the running minimum high and the running maximum low. */
	*minlen = 1000;
	*maxlen = 0;
	huffman_tree_to_counts_recursive(nodes, lengths, root, 0, minlen, maxlen);
}

/*
 * One row of the Huffman encoding table: the code for a symbol is the low
 * `size` bits of `data`.
 */
typedef struct {
	int size;   // code length in bits
	//union {
		int data;   // code bits
		//int *ptr;
	//}
} huffman_table_entry;

/*
 * Assign canonical codes from code lengths.
 *
 * Lengths are processed from minlen to maxlen; within one length, symbols
 * receive consecutive code values in increasing symbol order.  Moving on to
 * the next length appends a zero bit to the running code value.  Only
 * entries whose length lies in [minlen, maxlen] are written.
 */
void huffman_lengths_to_table(const int *lengths, int n_freq, int minlen, int maxlen, huffman_table_entry *table) {
	int next_code = 0;
	int bits = minlen;

	while (bits <= maxlen) {
		int sym;
		for (sym = 0; sym < n_freq; sym++) {
			if (lengths[sym] != bits)
				continue;

			huffman_table_entry *entry = &table[sym];
			entry->size = bits;
			entry->data = next_code++;
		}
		next_code <<= 1;
		bits++;
	}
}

/*
 * Rebuild a decoding tree from code lengths, level by level.
 *
 * Child slots are numbered globally: slot i is child (i & 1) of node
 * tree[i >> 1].  [level_begin, level_end) is the range of slots at the
 * current depth.  At every depth the symbols whose length equals that depth
 * fill the first slots in increasing symbol order; every remaining slot
 * becomes a link (NODE_T_BIT set) to a newly numbered node, whose two child
 * slots form part of the next level.
 *
 * NOTE(review): NODE_SET rewrites one half of a node and keeps the other
 * half as it was, so the first write to a node reads whatever `tree`
 * contained before; callers pass an uninitialised array.  The stale half is
 * overwritten once the node's second slot is filled.
 */
void huffman_lengths_to_tree(const int *lengths, int n_freq, int minlen, int maxlen, huffman_node *tree) {
	int i, len;
	int level_begin = 0;
	int level_end = 2;

	// Depths shallower than the shortest code contain only internal nodes.
	for (len = 1; len < minlen; len++) {
		// Next unused node index.
		int newi = level_end >> 1;
		for (i = level_begin; i < level_end; i++) {
			int idx = newi++;
			NODE_SET(tree[i >> 1], i & 1, NODE_T_BIT | idx);
		}
		level_begin = level_end;
		level_end = newi << 1;
	}

	for ( ; len <= maxlen; len++) {
		int newi = level_end >> 1;
		i = level_begin;

		// Leaves first: every symbol whose code has exactly `len` bits.
		int c;
		for (c = 0; c < n_freq; c++) {
			if (lengths[c] == len) {
				NODE_SET(tree[i >> 1], i & 1, c);
				i++;
			}
		}

		// The remaining slots of this level link to new internal nodes.
		for ( ; i < level_end; i++) {
			int idx = newi++;
			NODE_SET(tree[i >> 1], i & 1, NODE_T_BIT | idx);
		}
		level_begin = level_end;
		level_end = newi << 1;
	}
}

/*
 * Debug output: print every table entry as "<symbol>: <code bits>", most
 * significant code bit first, one entry per line.
 */
void show_table(const huffman_table_entry *table, int n_entries) {
	const huffman_table_entry *entry = table;
	int sym;

	for (sym = 0; sym < n_entries; sym++, entry++) {
		int bit = entry->size;

		printf("%d: ", sym);
		/* Visit bits size-1 .. 0. */
		while (bit-- > 0)
			printf("%d", (entry->data & (1 << bit)) != 0);
		printf("\n");
	}
}

/*
 * Debug output: print the subtree rooted at tree[root].  Each branch is
 * labelled "0" or "1"; a leaf prints "(symbol)" and ends the line, an
 * internal child continues on the same line one level deeper.  The "1"
 * branch is preceded by `depth` spaces so it lines up under its "0" sibling.
 */
void show_tree(const huffman_node *tree, int root, int depth) {
	const huffman_node node = tree[root];
	int pad = depth;

	printf("0");
	if (!NODE_T0(node))
		printf("(%d)\n", NODE_C0(node));
	else
		show_tree(tree, NODE_C0(node), depth + 1);

	while (pad-- > 0)
		printf(" ");
	printf("1");
	if (!NODE_T1(node))
		printf("(%d)\n", NODE_C1(node));
	else
		show_tree(tree, NODE_C1(node), depth + 1);
}

/*
 * Debug output: print `len` bits of `data`, starting at bit position
 * `begin`, as a string of 0/1 characters followed by a newline.
 */
void show_bitstream(int *data, int begin, int len) {
	bstream_t istr;
	BSTREAM_INIT_READ(istr, data, begin);
	// BSTREAM_TELL reports the reader's current bit position.
	while (BSTREAM_TELL(istr) < begin + len) {
		int bit;
		BSTREAM_READ_BIT(istr, bit);
		printf("%d", bit);
	}
	printf("\n");
}

/*
 * Serially encode input[0..n) with `table`, writing the codes to the bit
 * stream `output` starting at bit position `output_pos`, then flush the
 * writer.
 *
 * NOTE(review): several threads call this on the same `output` with
 * different bit offsets; that is only safe if BSTREAM_INIT_WRITE /
 * BSTREAM_FLUSH handle words shared between neighbouring ranges — confirm in
 * lib/bstream.h.
 */
void huffman_encode_block(const int *input, int n, const huffman_table_entry *table, int *output, int output_pos) {
	int i;
	bstream_t ostr;
	BSTREAM_INIT_WRITE(ostr, output, output_pos);
	for (i = 0; i < n; i++) {
		const huffman_table_entry *entry = &table[input[i]];
		BSTREAM_WRITE(ostr, entry->data, entry->size);
	}
	BSTREAM_FLUSH(ostr);
}

/*
 * Parallel Huffman encoder.
 *
 * The n input symbols are split into groups of k = 128.  A first pass sums
 * the code lengths of each group, a prefix sum turns those sums into bit
 * offsets, and a second pass lets every group write its codes at its own
 * offset (relative to `output_pos`) in `output`.
 *
 * Returns the total number of bits produced (not including output_pos).
 */
int huffman_encode(const int *input, int n, const huffman_table_entry *table, int *output, int output_pos) {
	STARTTIME();

	int k = 128;
	int nk = CDIV(n, k);

	// bit_pos[g + 1] first holds the bit count of group g; after the prefix
	// sum, bit_pos[g] is the starting bit offset of group g.
	int bit_pos[nk + 1];
	bit_pos[0] = 0;
	spawn(0, nk - 1) {
		int min = $ * k;
		int max = min + k;
		if (max > n)
			max = n;

		int i;
		int count = 0;
		for (i = min; i < max; i++) {
			int s;
			// NOTE(review): loadROBuffer2 presumably reads the value
			// through XMT's read-only buffer — confirm.
			loadROBuffer2(s, table[input[i]].size);
			count += s;
		}
		bit_pos[$ + 1] = count;
	}
	SHOWTIME("init_bit_pos");
	prefix_sum_int(bit_pos + 1, bit_pos + 1, nk);
	SHOWTIME("psum_bit_pos");

	// Encode blocks
	spawn(0, nk - 1) {
		int min = $ * k;
		int max = min + k;
		if (max > n)
			max = n;

		huffman_encode_block(input + min, max - min, table, output, bit_pos[$] + output_pos);
	}
	SHOWTIME("encode");

	return bit_pos[nk];
}

/*
 * Macro form of
 *     int huffman_decode_char(bstream_t *istr, int istr_end, const huffman_node *tree)
 *
 * Decodes one symbol from bit reader `istr` by walking `tree` from node 0,
 * taking child 1 for a 1 bit and child 0 for a 0 bit, until a leaf is
 * reached; the leaf's symbol is stored in `ch`.  If the reader reaches bit
 * position `istr_end` before a leaf is found, `ch` is left at -1.
 */
#define HUFFMAN_DECODE_CHAR(istr, istr_end, tree, ch) do { \
	int _node_index = 0; \
	ch = -1; \
	while (BSTREAM_TELL(istr) < istr_end) { \
		int _bit; \
		BSTREAM_READ_BIT(istr, _bit); \
		\
		int _t; \
		int _c; \
		huffman_node _node; \
		loadROBuffer2(_node, tree[_node_index]); \
		/* Required for MTCU */ \
		/*huffman_node _node = tree[_node_index];*/ \
		if (_bit) { \
			_t = NODE_T1(_node); \
			_c = NODE_C1(_node); \
		} \
		else { \
			_t = NODE_T0(_node); \
			_c = NODE_C0(_node); \
		} \
		\
		if (_t) \
			_node_index = _c; \
		else { \
			ch = _c; \
			break; \
		} \
	} \
} while (0)

/*
 * Serially decode symbols from the bit range [begin, end) of `input` into
 * `output`.  Decoding stops at the end-of-stream symbol `eos_char` (which is
 * not stored) or when the bits run out.  Returns the number of symbols
 * written.
 */
int huffman_decode_block(int *input, int begin, int end, const huffman_node *tree, int *output, int eos_char) {
	bstream_t istr;
	int written = 0;

	BSTREAM_INIT_READ(istr, input, begin);
	for (;;) {
		int sym;
		HUFFMAN_DECODE_CHAR(istr, end, tree, sym);

		if (sym != -1 && sym != eos_char) {
			output[written++] = sym;
			continue;
		}
		/* Out of bits, or end-of-stream marker reached. */
		return written;
	}
}

#if 0  // serial
// Serial reference implementation (disabled by the surrounding #if 0):
// decodes the whole bit range [begin, end) with a single call to
// huffman_decode_block and returns the number of symbols written.
int huffman_decode(int *input, int begin, int end, const huffman_node *tree, int *output, int eos_char) {
	// NOTE: To use this, replace loadROBuffer2 with an assignment statement
	return huffman_decode_block(input, begin, end, tree, output, eos_char);
}
#else  // block parallel
typedef int next_ptr_t[32];
/*
 * Block-parallel Huffman decoder.
 *
 * Decodes the bit range [begin, end) of `input` into `output` (one symbol
 * per int), stopping at `eos_char`, and returns the number of symbols
 * written.
 *
 * The bit range is cut into blocks of `blocksize` bits.  Code words may
 * straddle block boundaries, so the true start of decoding inside each block
 * is found first:
 *   1. init_seg      : for every block except the last, and for each of the
 *                      first l start offsets i, seg_next[block][i] is the
 *                      offset into the following block at which decoding
 *                      resumes when this block is entered at offset i
 *                      (0 once the end of the stream has been reached).
 *   2. psum_seg_up   : neighbouring tables are composed pairwise, level by
 *                      level; level_idx[h] is the first seg_next row of
 *                      level h.
 *   3. psum_seg_down : the composed tables are applied top-down to obtain
 *                      start_pos[block], the real start offset per block.
 *   4. count_out / psum_out / decode : every block counts its symbols, a
 *                      prefix sum yields output offsets, and the blocks are
 *                      decoded independently.
 *
 * NOTE(review): step 1 assumes every start offset it looks up is below
 * l = 32, i.e. that no code word overruns a block end by 32 bits or more —
 * confirm against the maximum code length.
 */
int huffman_decode(int *input, int begin, int end, const huffman_node *tree, int *output, int eos_char) {
	STARTTIME();
	int blocksize = 256;
	int l = 32;
	int n = end - begin;
	int nt = CDIV(n, blocksize);
	next_ptr_t seg_next[2*nt];

	spawn(0, nt - 2) {
		int blockbegin = $ * blocksize + begin;
		int blockend = blockbegin + blocksize;
		int i;

		// Offsets are handled from high to low so that a decode which lands
		// on a later offset < l inside this block can reuse that offset's
		// already computed answer.
		for (i = l - 1; i >= 0; i--) {
			bstream_t istr;
			BSTREAM_INIT_READ(istr, input, blockbegin + i);
			int next = -1;
			while (BSTREAM_TELL(istr) < blockend) {
				int ch;
				HUFFMAN_DECODE_CHAR(istr, end, tree, ch);
				if (ch == -1 || ch == eos_char) {
					next = 0;
					break;
				}
				else if (BSTREAM_TELL(istr) - blockbegin < l) {
					next = seg_next[$][BSTREAM_TELL(istr) - blockbegin];
					break;
				}

			}
			if (next == -1)
				next = BSTREAM_TELL(istr) - blockend;
			seg_next[$][i] = next;
		}
	}

	SHOWTIME("init_seg");

	// One entry per reduction level plus the end marker.  The up-sweep
	// below writes index floor(log2(nt - 1)) + 1, which is one slot past
	// an array of ceil_log2(nt - 1) + 1 entries whenever nt - 1 is a power
	// of two.  The number of levels can never exceed the bit width of int,
	// so a fixed bound is used; this also avoids depending on ceil_log2
	// for nt <= 1.
	int level_idx[8 * sizeof(int) + 2];
	level_idx[0] = 0;
	level_idx[1] = nt - 1;
	int h = 1;
	while (1) {
		int cur_base = level_idx[h - 1];
		int next_base = level_idx[h];
		int count = next_base - cur_base;
		int next_count = count / 2;
		if (next_count < 1)
			break;
		// Compose tables 2*$ and 2*$+1 of the current level into table $
		// of the next level.
		spawn(0, next_count - 1) {
			int i;
			for (i = 0; i < 32; i++)
				seg_next[next_base + $][i] = seg_next[cur_base + 2*$ + 1][seg_next[cur_base + 2*$][i]];
		}
		h++;
		level_idx[h] = next_base + next_count;
	}

	SHOWTIME("psum_seg_up");

	int start_pos[nt];
	int distance = 1 << h;
	start_pos[0] = 0;
	while (distance > 1) {
		int cur_base = level_idx[h - 1];
		int next_base = level_idx[h];
		int count = next_base - cur_base;
		int neighbor = distance / 2;
		// Blocks whose start is already known derive the start of the
		// block `neighbor` positions further on.
		spawn(0, CDIV(count, 2) - 1) {
			start_pos[$ * distance + neighbor] = seg_next[cur_base + 2*$][start_pos[$ * distance]];
		}
		h--;
		distance = neighbor;
	}

	SHOWTIME("psum_seg_down");

	int opos[nt + 1];
	spawn(0, nt - 1) {
		// [i, j): exact bit range of the code words belonging to block $.
		int i = $ * blocksize + begin + start_pos[$];
		int j = ($ == nt - 1) ? end : ($ + 1) * blocksize + begin + start_pos[$ + 1];
		int output_ptr = 0;
		bstream_t istr;
		BSTREAM_INIT_READ(istr, input, i);
		while (1) {
			int c;
			HUFFMAN_DECODE_CHAR(istr, j, tree, c);
			if (c == -1 || c == eos_char)
				break;
			output_ptr++;
		}
		opos[$ + 1] = output_ptr;
	}
	SHOWTIME("count_out");

	opos[0] = 0;
	prefix_sum_int(opos + 1, opos + 1, nt);
	SHOWTIME("psum_out");

	spawn(0, nt - 1) {
		int i = $ * blocksize + begin + start_pos[$];
		int j = ($ == nt - 1) ? end : ($ + 1) * blocksize + begin + start_pos[$ + 1];
		huffman_decode_block(input, i, j, tree, output + opos[$], eos_char);
	}
	SHOWTIME("decode");

	return opos[nt];
}
#endif

/*
 * Print s[0..n) as characters.  Values outside the printable ASCII range
 * 32..126 are shown as two-digit (or longer) hex in reverse video.
 */
void printi_escaped(const int *s, int n) {
	int i = 0;

	while (i < n) {
		unsigned int c = s[i++];
		int printable = c >= 32 && c < 127;

		if (printable)
			printf("%c", c);
		else
			printf("\033[7m%.2x\033[m", c);
	}
}

/*
 * Print s[0..n) as a parenthesised, comma-separated list of decimal
 * integers, e.g. "(1, 2, 3)".  No trailing newline.
 */
void printi(const int *s, int n) {
	int i;

	printf("(");
	/* First element has no separator in front of it. */
	if (n > 0)
		printf("%d", s[0]);
	for (i = 1; i < n; i++)
		printf(", %d", s[i]);
	printf(")");
}

/*
 * Print the n bytes at `s` as characters.  Bytes outside the printable
 * ASCII range 32..126 are shown as two-digit hex in reverse video.
 */
void print_escaped(const char *s, int n) {
	int i = 0;

	while (i < n) {
		unsigned char c = s[i++];
		int printable = c >= 32 && c < 127;

		if (printable)
			printf("%c", c);
		else
			printf("\033[7m%.2x\033[m", c);
	}
}

/*
 * Compress the global `input` (N bytes) into the global bit stream
 * `huffman_stream`, leaving its length in bits in `huffman_stream_len`.
 *
 * Pipeline: run-length encode -> pack -> Burrows-Wheeler transform ->
 * move-to-front encode -> Huffman encode.  A file header and a block header
 * (magic numbers, origPtr, used-symbol bitmaps, code lengths) are written in
 * front of the Huffman codes.  Cycle counts of the stages are printed.
 *
 * When DECOMPRESS_ONLY is defined, nothing is compressed; the stream length
 * and BWT index are taken from HUFSTREAM_BITS / BWT_INDEX instead.
 *
 * NOTE(review): under DEBUG, print_escaped (which takes const char *) is
 * passed the int arrays `input` / `rle` — confirm this is intended.
 */
void do_compress() {
	int start, end;

//#define DECOMPRESS_ONLY
#ifdef DECOMPRESS_ONLY
	#define huff_tree huftree
	#define huffman_stream hufstream
	huffman_stream_len = HUFSTREAM_BITS;
	int index = BWT_INDEX;
#else
	#ifdef DEBUG
	printf("orig: '");
	print_escaped(input, N);
	printf("'\n");
	#endif

	// Write file header
	int i, j;
	bstream_t ostr;
	BSTREAM_INIT_WRITE(ostr, huffman_stream, 0);
	BSTREAM_WRITE(ostr, 0x425a, 16); // magic
	BSTREAM_WRITE(ostr, 0x68, 8); // version
	BSTREAM_WRITE(ostr, 0x30 + 9, 8); // hundred_k_blocksize

	// The timer started here covers RLE, pack and BWT together; it is read
	// after compute_bwt and reported as "BWT cycles".
	xmt_readtimer32(start);
	int rle[N * RLE_EXPAND];
	int rle_len = rle_encode(input, rle, N);
	//xmt_readtimer32(end);
	//printf(" RLE cycles: %d\n", end - start);
	#ifdef DEBUG
	printf("rle: '");
	print_escaped(rle, rle_len);
	printf("'\n");
	#endif

	//xmt_readtimer32(start);
	int block[rle_len];
	pack(rle, block, rle_len);
	//xmt_readtimer32(end);
	//printf("PACK cycles: %d\n", end - start);

	//xmt_readtimer32(start);
	int index = compute_bwt(block, rle_len, bwt);
	xmt_readtimer32(end);
	printf(" BWT cycles: %d\n", end - start);
	#ifdef DEBUG
	printf(" bwt: ('");
	printi_escaped(bwt, rle_len);
	printf("',%d)\n", index);
	#endif
//#define BWT_ONLY
//#ifndef BWT_ONLY
	xmt_readtimer32(start);
	// used_chars[b] is read below as a 0/1 flag per byte value;
	// n_used_chars is the size of the MTF output alphabet.
	int used_chars[ALPHABET_SIZE];
	int n_used_chars;
	int mtflen = mtf_encode(bwt, mtf_stream, rle_len, used_chars, &n_used_chars);
	xmt_readtimer32(end);
	printf(" MTF cycles: %d\n", end - start);
	printf(" MTF length: %d\n", mtflen);
	#ifdef DEBUG
	printf(" mtf: (");
	printi(mtf_stream, mtflen);
	printf(",%d)\n", index);
	#endif
	//return 0;

	dump(mtf_stream, "mtf.txt");

	xmt_readtimer32(start);
	int n_groups = 1;
	int n_selectors = CDIV(mtflen, BYTES_PER_SELECTOR);
	int freq[n_used_chars];
	STARTTIME();
	freq_count(mtf_stream, mtflen, freq, n_used_chars);
	SHOWTIME("freq_count");
	int lengths[n_used_chars];
	int minlen, maxlen;
	build_huffman_lengths(freq, n_used_chars, lengths, &minlen, &maxlen);
	huffman_table_entry huff_table[n_used_chars];
	huffman_lengths_to_table(lengths, n_used_chars, minlen, maxlen, huff_table);
	SHOWTIME("build_huffman_tree");

	// Two-level bitmap of the byte values in use: used_bitmaps[i] covers
	// values 16*i .. 16*i+15 (first value in the top bit), and used_map has
	// one bit per non-empty 16-value range (range 0 in the top bit).
	int used_map = 0;
	int used_bitmaps[16];
	for (i = 0; i < 16; i++) {
		int used_bitmap = 0;
		for (j = 0; j < 16; j++)
			used_bitmap = used_bitmap << 1 | used_chars[16 * i + j];
		used_bitmaps[i] = used_bitmap;
		used_map = used_map << 1 | (used_bitmap != 0);
	}

	// Block header
	BSTREAM_WRITE(ostr, 0x31415926, 32); // compressed_magic_1
	BSTREAM_WRITE(ostr, 0x5359, 16); // compressed_magic_2
	BSTREAM_WRITE(ostr, 0, 32); // CRC  TODO
	BSTREAM_WRITE_BIT(ostr, 0); // randomized
	BSTREAM_WRITE(ostr, index, 24); // origPtr
	BSTREAM_WRITE(ostr, used_map, 16); // huffman_used_map
	for (i = 0; i < 16; i++)
		if (used_bitmaps[i] != 0)
			BSTREAM_WRITE(ostr, used_bitmaps[i], 16);
	BSTREAM_WRITE(ostr, n_groups, 3); // huffman_groups
	BSTREAM_WRITE(ostr, n_selectors, 15); // selectors_used
	
	// selector_list
	// (not written: the selector count above is stored, the list itself is not)
	//write_selectors(ostr, selectors, n_selectors); 
	
	// Code lengths are stored as a 5-bit start value followed, per symbol,
	// by "1 0" (increment) / "1 1" (decrement) steps and a terminating 0.
	int hlen = lengths[0];
	BSTREAM_WRITE(ostr, hlen, 5); // start_huffman_length
	for (i = 1; i < n_used_chars; i++) {
		int newlen = lengths[i];
		// delta_bit_length
		while (newlen > hlen) {
			BSTREAM_WRITE_BIT(ostr, 1); // alter length
			BSTREAM_WRITE_BIT(ostr, 0); // increment length
			hlen++;
		}
		while (newlen < hlen) {
			BSTREAM_WRITE_BIT(ostr, 1); // alter length
			BSTREAM_WRITE_BIT(ostr, 1); // decrement length
			hlen--;
		}
		BSTREAM_WRITE_BIT(ostr, 0); // next symbol
	}
	
	// The value returned by BSTREAM_FLUSH is used as the header length in
	// bits; the Huffman codes are appended from that position.
	huffman_stream_len = BSTREAM_FLUSH(ostr);

	#ifdef DEBUG
	show_table(huff_table, n_used_chars);
	#endif
	int hufbegin = huffman_stream_len;
	int huflen = huffman_encode(mtf_stream, mtflen, huff_table, huffman_stream, huffman_stream_len);
	huffman_stream_len += huflen;

	#if 0  // TODO: remove #if
	// Write file footer
	BSTREAM_SEEK(ostr, huffman_stream_len);
	BSTREAM_WRITE(ostr, 0x17724538, 32); // eos_magic_1
	BSTREAM_WRITE(ostr, 0x5090, 16); // eos_magic_2
	BSTREAM_WRITE(ostr, 0, 32); // CRC  TODO
	huffman_stream_len = BSTREAM_FLUSH(ostr);
	#endif

	xmt_readtimer32(end);
	printf(" HUF cycles: %d\n", end - start);
	#if DEBUG
	printf(" huf: ");
	show_bitstream(huffman_stream, hufbegin, huflen);
	#endif
#endif

	dump(huffman_stream, "hufstream.txt");
	//printf("#define HUFSTREAM_BITS %d\n", huffman_stream_len);
	//printf("#define BWT_INDEX %d\n", index);
	/*
	FILE *f = fopen("huftree.txt", "w");
	for (int i = 0; i < huff_tree_size; i++)
		fprintf(f, "%d\n", huff_tree[i]);
	fclose(f);

	printf("stream size (bits): %d\n", huffman_stream_len);
	FILE *f = fopen("hufstream.txt", "w");
	for (int i = 0; i < CDIV(huffman_stream_len, 32); i++)
		fprintf(f, "%d\n", huffman_stream[i]);
	fclose(f);
	return;
	*/
}

/*
 * Decompress the global bit stream `huffman_stream` (length
 * `huffman_stream_len` bits) into the global `round_trip` buffer.
 *
 * Pipeline: parse file and block headers -> rebuild the Huffman tree from
 * the stored code lengths -> Huffman decode -> move-to-front decode ->
 * inverse BWT -> run-length decode.  Cycle counts of the stages are printed.
 *
 * Header fields are checked with ASSERT, which only reports a mismatch and
 * does not stop execution.  `crc`, `n_groups` and `n_selectors` are read to
 * advance the stream but are not used afterwards.
 */
void do_decompress() {
	int start, end;
	int i, j;

	int crc;
	int index;
	int used_map = 0;
	int used_bitmaps[16];
	int used_chars[ALPHABET_SIZE];
	int n_groups;
	int n_selectors;
	
	xmt_readtimer32(start);
	
	// Read file header
	bstream_t istr;
	int readval;
	BSTREAM_INIT_READ(istr, huffman_stream, 0);
	BSTREAM_READ(istr, readval, 16); ASSERT(readval == 0x425a); // magic
	BSTREAM_READ(istr, readval, 8); ASSERT(readval == 0x68); // version
	BSTREAM_READ(istr, readval, 8); ASSERT(readval == 0x30 + 9); // hundred_k_blocksize

	// Read block header
	BSTREAM_READ(istr, readval, 32); ASSERT(readval == 0x31415926); // compressed_magic_1
	BSTREAM_READ(istr, readval, 16); ASSERT(readval == 0x5359); // compressed_magic_2
	BSTREAM_READ(istr, crc, 32); // CRC  TODO
	BSTREAM_READ_BIT(istr, readval); ASSERT(readval == 0); // randomized
	BSTREAM_READ(istr, index, 24); // origPtr
	BSTREAM_READ(istr, used_map, 16); // huffman_used_map
	// The symbol alphabet has two more entries than there are byte values
	// in use; the last one (n_used_chars - 1) is the end-of-stream symbol
	// passed to huffman_decode below.
	int n_used_chars = 2;
	for (i = 0; i < 16; i++) {
		// Bit (15 - i) of used_map says whether a 16-bit bitmap for byte
		// values 16*i .. 16*i+15 follows in the stream.
		if (used_map & (1 << 15 - i))
			BSTREAM_READ(istr, readval, 16);
		else
			readval = 0;
		for (j = 0; j < 16; j++) {
			int used = readval >> 15 - j & 1;
			used_chars[16 * i + j] = used;
			n_used_chars += used;
		}
	}
	BSTREAM_READ(istr, n_groups, 3); // huffman_groups
	BSTREAM_READ(istr, n_selectors, 15); // selectors_used
	
	// selector_list
	//read_selectors(istr, selectors, n_selectors); 
	
	// Code lengths: a 5-bit start value, then per symbol a sequence of
	// "1 0" (increment) / "1 1" (decrement) steps ended by a 0 bit.
	int lengths[n_used_chars];
	int hlen;
	BSTREAM_READ(istr, hlen, 5); // start_huffman_length
	lengths[0] = hlen;
	int minlen = hlen;
	int maxlen = hlen;
	for (i = 1; i < n_used_chars; i++) {
		int bit;
		// delta_bit_length
		while (1) {
			BSTREAM_READ_BIT(istr, bit);
			if (bit == 0)
				break; // next symbol
			BSTREAM_READ_BIT(istr, bit);
			if (bit == 0) // increment length
				hlen++;
			else // decrement length
				hlen--;
		}
		lengths[i] = hlen;
		if (hlen < minlen)
			minlen = hlen;
		if (hlen > maxlen)
			maxlen = hlen;
	}

	huffman_node huff_tree[n_used_chars];
	huffman_lengths_to_tree(lengths, n_used_chars, minlen, maxlen, huff_tree);
	#ifdef DEBUG
	show_tree(huff_tree, 0, 0);
	#endif
	// The Huffman codes start right after the header, at the reader's
	// current bit position.
	int hufsize = huffman_decode(huffman_stream, BSTREAM_TELL(istr), huffman_stream_len, huff_tree, ihuf_stream, n_used_chars - 1);
	xmt_readtimer32(end);
	printf("IHUF cycles: %d\n", end - start);
	printf("decoded size (bytes) = %d\n", hufsize);
	#ifdef DEBUG
	printf("ihuf: (");
	printi(ihuf_stream, hufsize);
	printf(",%d)\n", index);
	#endif
	dumpb(ihuf_stream, "ihuf_stream.bin");

	xmt_readtimer32(start);
	int imtflen = mtf_decode(ihuf_stream, imtf_stream, hufsize, used_chars);
	xmt_readtimer32(end);
	printf("IMTF cycles: %d\n", end - start);
	#ifdef DEBUG
	printf("imtf: ('");
	printi_escaped(imtf_stream, imtflen);
	printf("',%d)\n", index);
	#endif
//#endif
	// The timer started here covers the inverse BWT and the run-length
	// decode together; it is reported as "IRLE cycles".
	xmt_readtimer32(start);
	ibwt(imtf_stream, index, ibwt_stream, imtflen);
	//xmt_readtimer32(end);
	//printf("IBWT cycles: %d\n", end - start);
	#ifdef DEBUG
	printf("ibwt: '");
	printi_escaped(ibwt_stream, imtflen);
	printf("'\n");
	#endif
	//xmt_readtimer32(start);
	int finallen = rle_decode(ibwt_stream, round_trip, imtflen);
	xmt_readtimer32(end);
	printf("IRLE cycles: %d\n", end - start);
	#ifdef DEBUG
	printf("done: '");
	print_escaped(round_trip, finallen);
	printf("'\n");
	#else
	//printf("done: ");
	//prints(imtf_stream, MIN(N, 10000));
	//printf("\n");
	#endif
	dumpb(round_trip, "round_trip.txt");
}

/*
 * Driver: compress the built-in input, report the compressed size, then
 * decompress it again, printing the cycle count of each phase.
 */
int main() {
	int t_begin, t_end;

	/* Compression phase. */
	xmt_readtimer32(t_begin);
	do_compress();
	xmt_readtimer32(t_end);
	printf("* COMPRESS cycles: %d\n", t_end - t_begin);
	printf("Compressed size: %d bits = %d bytes\n", huffman_stream_len, CDIV(huffman_stream_len, 8));

	/* Decompression phase. */
	xmt_readtimer32(t_begin);
	do_decompress();
	xmt_readtimer32(t_end);
	printf("* DECOMPRESS cycles: %d\n", t_end - t_begin);

	return 0;
}
