#include <xmtc.h>
#include "profile.h"
#include "lib/arith.h"
#include "lib/rand1m2.h"
#include "lib/shift.h"
#include "lib/sum.h"


// Reverse the order of the four low bytes of x (byte 0 <-> byte 3,
// byte 1 <-> byte 2), i.e. convert a 32-bit word between little- and
// big-endian representation.
static unsigned __inline__ swap_bytes(unsigned x) {
	const unsigned to_top    = x << 24;               // byte 0 -> byte 3
	const unsigned to_upper  = (x << 8) & 0xff0000;   // byte 1 -> byte 2
	const unsigned to_lower  = (x >> 8) & 0xff00;     // byte 2 -> byte 1
	const unsigned to_bottom = x >> 24;               // byte 3 -> byte 0
	return to_top | to_upper | to_lower | to_bottom;
}

// Expand the n bytes at `input` into n words in `block`, one word per byte
// position: block[k] receives the four consecutive input bytes starting at
// position k, with byte k in the most significant 8 bits. Positions near the
// end wrap around to the start of the input (see the serial tail below).
//
// The byte-to-word mapping described here assumes a little-endian target with
// 32-bit `unsigned` -- TODO confirm for the XMT target.
//
// NOTE(review): the arithmetic below appears to require n >= 5 (for smaller n,
// `end` is <= 0 and `lower_bits` reaches 32 or more, making the shifts
// undefined). When n is not a multiple of 4, input_words[end + 1] also reads
// up to 3 bytes past input[n - 1]; callers presumably pad the buffer.
// `input` is accessed through an `unsigned *`, so it is assumed to be
// suitably aligned.
void pack(const unsigned char *input, unsigned *block, int n) {
	const unsigned *input_words = (const unsigned *)input;
	// Number of whole input words handled by the parallel section; the last
	// 5..8 byte positions are left for the serial tail.
	int end = (n - 5) / 4;
	// One virtual thread per input word: thread $ fills block[4*$ .. 4*$ + 3].
	if (end > 0) spawn(0, end - 1) {
		// `next` is deliberately NOT byte-swapped: its low byte is taken first.
		int next = input_words[$ + 1];
		// Each step shifts the window left one byte and appends the next byte.
		int value = swap_bytes(input_words[$]);  block[4*$    ] = value;
		value = value << 8 | next       & 0xff;  block[4*$ + 1] = value;
		value = value << 8 | next >> 8  & 0xff;  block[4*$ + 2] = value;
		value = value << 8 | next >> 16 & 0xff;  block[4*$ + 3] = value;
	}


	// Serial tail: treat value:next:first as a byte shift register so the
	// remaining positions pick up bytes from the start of the input.
	int i;
	int value = swap_bytes(input_words[end]);
	unsigned int next = swap_bytes(input_words[end + 1]);
	unsigned int first = swap_bytes(input_words[0]);
	// Byte positions still to be written (block[4*end .. n - 1]).
	int remain = n - 4 * end;
	// Number of low bits of `next` that lie beyond the end of the input.
	int lower_bits = 8 * (8 - remain);
	int lower_mask = (1 << lower_bits) - 1;
	// Replace the out-of-range low bits of `next` with the leading bits of
	// `first`; parses as (next & ~lower_mask) | (first >> (32 - lower_bits)).
	if (lower_bits > 0)
		next = next & ~lower_mask | first >> 32 - lower_bits;
	// Drop the bits of `first` that were just moved into `next`.
	first <<= lower_bits;
	for (i = 0; i < remain; i++) {
		block[4*end + i] = value;
		// Advance the window by one byte: value <- next <- first.
		value = value << 8 | next >> 24;
		next = next << 8 | first >> 24;
		first <<= 8;
	}
}

// Three-way comparison of the cyclic sequences of `block` words that start at
// positions x and y. Both positions advance 4 entries per step and wrap
// around at nblock. Words are compared as unsigned values.
//
// Returns 1 if the sequence at x is greater, -1 if it is smaller, and 0 if
// x == y or no difference is found within nblock steps.
static int __inline__ compare(int x, int y, const unsigned *block, int nblock) {
	int steps_left;

	if (x == y)
		return 0;

	for (steps_left = nblock; steps_left > 0; steps_left--) {
		const unsigned word_x = block[x];
		const unsigned word_y = block[y];

		if (word_x > word_y)
			return 1;
		if (word_x < word_y)
			return -1;

		// Equal so far: move both cursors forward by one word's worth of
		// positions, wrapping cyclically.
		x += 4;
		if (x >= nblock)
			x -= nblock;
		y += 4;
		if (y >= nblock)
			y -= nblock;
	}

	return 0;
}

//  Source: http://alienryderflex.com/quicksort/
//  Adapted by James Edwards
//
//  quicksort
//
//  This public-domain C implementation by Darel Rex Finley.
//
//  * This function assumes it is called with valid parameters.
//
//  * Example calls:
//    quicksort(&myArray[0],5); // sorts elements 0, 1, 2, 3, and 4
//    quicksort(&myArray[3],5); // sorts elements 3, 4, 5, 6, and 7
// Sort arr[0 .. elements-1] in place into ascending order according to
// compare(), where each array entry is a position in `block`.
//
// Iterative quicksort: pending sub-ranges [lo, hi) are kept on an explicit
// stack, and the smaller of the two partitions is always processed first.
static void quicksort(int *arr, int elements, const unsigned *block, int nblock) {
	#define SWAP(x, y) do { typeof(x) t = x; x = y; y = t; } while (0)
	#define QSORT_MAX_LEVELS 32
	int lo_stack[QSORT_MAX_LEVELS], hi_stack[QSORT_MAX_LEVELS];
	int top = 0;

	lo_stack[0] = 0;
	hi_stack[0] = elements;

	while (top >= 0) {
		int left = lo_stack[top];
		int right = hi_stack[top] - 1;

		// Ranges of fewer than two elements are already sorted: pop them.
		if (left >= right) {
			top--;
			continue;
		}

		// The first element is the pivot; taking it out leaves a hole at
		// `left` that alternates between the two ends while partitioning.
		int pivot = arr[left];
		while (left < right) {
			// Scan from the right for an element smaller than the pivot.
			while (compare(arr[right], pivot, block, nblock) >= 0 && left < right)
				right--;
			if (left < right) {
				arr[left] = arr[right];
				left++;
			}
			// Scan from the left for an element greater than the pivot.
			while (compare(arr[left], pivot, block, nblock) <= 0 && left < right)
				left++;
			if (left < right) {
				arr[right] = arr[left];
				right--;
			}
		}
		arr[left] = pivot;

		// Split the current range into [lo, left) and [left + 1, hi).
		lo_stack[top + 1] = left + 1;
		hi_stack[top + 1] = hi_stack[top];
		hi_stack[top] = left;
		top++;

		// Keep the smaller partition on top of the stack.
		if (hi_stack[top] - lo_stack[top] > hi_stack[top - 1] - lo_stack[top - 1]) {
			SWAP(lo_stack[top], lo_stack[top - 1]);
			SWAP(hi_stack[top], hi_stack[top - 1]);
		}
	}
}

// Binary search for position x in the array a[0 .. len-1], which is expected
// to be sorted in ascending compare() order.
//
// Returns the index of the first element that compares strictly greater than
// x (len if there is none); equivalently, the number of leading elements
// a[i] with compare(x, a[i]) >= 0.
static int binsearch(const int *a, int len, int x, const unsigned *block, int nblock) {
	int left = 0;
	int right = len;
	while (left < right) {
		// Overflow-safe midpoint: left + right could exceed INT_MAX for
		// large len, which is undefined behavior for signed int.
		int mid = left + (right - left) / 2;
		int a_mid;
		// loadROBuffer2 is not defined in this file; presumably it reads
		// a[mid] into a_mid through the XMT read-only buffer -- TODO confirm.
		loadROBuffer2(a_mid, a[mid]);
		if (compare(x, a_mid, block, nblock) < 0)
			right = mid;
		else
			left = mid + 1;
	}
	return left;
}

// Tuning constant for the sample sort; defaults to 64 unless the build
// already defines it (e.g. with -DSAMPLESORT_FACTOR=...).
// NOTE(review): it is not used in this file; presumably samplesort_inc.c
// consumes it -- confirm its exact meaning there.
#ifndef SAMPLESORT_FACTOR
#define SAMPLESORT_FACTOR 64
#endif

// samplesort: the function body is supplied by samplesort_inc.c, which is
// included immediately after the signature. For this instantiation,
// SAMPLESORT_INPUT(i) expands to a[i], so input element i is read from the
// caller-supplied array `a`.
// NOTE(review): presumably sorts the `len` entries of `a` into `out` using
// compare() on `block` -- confirm against samplesort_inc.c.
#define SAMPLESORT_INPUT(i) a[i]
static void samplesort(const int *a, int len, int *out, const unsigned *block, int nblock)
#include "samplesort_inc.c"
#undef SAMPLESORT_INPUT

// samplesort_first: second instantiation of the body in samplesort_inc.c.
// Here SAMPLESORT_INPUT(i) expands to i itself, so there is no input array:
// input element i is simply the value i (the sequence 0, 1, ..., len-1).
// NOTE(review): presumably this avoids materializing an identity array for
// the top-level sort -- confirm against samplesort_inc.c.
#define SAMPLESORT_INPUT(i) i
static void samplesort_first(int len, int *out, const unsigned *block, int nblock)
#include "samplesort_inc.c"
#undef SAMPLESORT_INPUT

// Compute the suffix array of `block` into `sa`.
//
// Delegates to samplesort_first(), passing `len` both as the number of
// positions to sort and as the cyclic length (nblock) used for comparison.
// `sa` must have room for `len` entries.
void compute_suffix_array(const unsigned *block, int len, int *sa) {
	samplesort_first(len, sa, block, len);
}

// Compute the Burrows-Wheeler transform of `block` into `bwt` (len entries).
//
// Builds the suffix array, then for each sorted position emits a value
// derived from the word preceding that suffix start (cyclically). Returns the
// index in the sorted order at which suffix-array entry 0 occurs, and also
// prints it.
//
// NOTE(review): `sa` is a variable-length array on the stack, so large `len`
// values need a correspondingly large stack.
int compute_bwt(const unsigned *block, int len, int *bwt) {
	int sa[len];
	compute_suffix_array(block, len, sa);

	// NOTE(review): `begin` is only assigned by the thread that finds
	// sa[$] == 0; it stays uninitialized if no such entry exists
	// (e.g. len == 0).
	int begin;
	// One virtual thread per output position.
	spawn(0, len - 1) {
		int j = sa[$];
		if (j == 0) {
			begin = $;
			// Wrap around: the entry before position 0 is position len - 1.
			j = len;
		}
		// SRL is not defined in this file; presumably it stores
		// block[j - 1] shifted right (logically) by 24 bits into bwt[$],
		// i.e. the top byte of that word -- TODO confirm in lib/shift.h.
		SRL(bwt[$], block[j - 1], 24);
	}
	// SHOWTIME is not defined in this file; presumably a profiling hook from
	// profile.h.
	SHOWTIME("permute");
	printf("BWT index = %d\n", begin);
	return begin;
}
