#include <stdio.h>
#include <spu_intrinsics.h>
#include <spu_mfcio.h>
#include "mt_mine.h"

/*
 * Period parameters.
 * Only N is referenced by name in the code visible in this file; M,
 * MATRIX_A, UPPER_MASK and LOWER_MASK are kept for reference, while
 * genrand_mine() hard-codes the corresponding values (vector offsets
 * 99/100, 0x9908b0dfU and 0x7fffffffU).
 */
#define N 624
#define M 397
#define MATRIX_A 0x9908b0dfUL   /* constant vector a */
#define UPPER_MASK 0x80000000UL /* most significant w-r bits */
#define LOWER_MASK 0x7fffffffUL /* least significant r bits */

/*
 * Generator state array. It is 16-byte aligned because genrand_mine()
 * accesses it through a vec_uint4 pointer (N / 4 = 156 vectors).
 * NOTE(review): that vector view assumes unsigned long is 32 bits wide
 * on the target -- confirm for the toolchain in use.
 */
static unsigned long mt[N] __attribute__((aligned(16))); 
/* Index written by init_genrand_mine(); it is N once seeding has finished. */
static int mti=N+1; /* mti==N+1 means mt[N] is not initialized */

/*
 * Seeds the state array mt[] from the value s.
 *
 * Only the low 32 bits of s are used.  Every following word is derived
 * from its predecessor with a linear recurrence and masked to 32 bits.
 * On return mti equals N, which marks the state as initialized.
 */
void init_genrand_mine(unsigned long s)
{
  unsigned long prev = s & 0xffffffffUL;

  mt[0] = prev;
  mti = 1;
  while (mti < N) {
    /* Multiplier: see Knuth TAOCP Vol2. 3rd Ed. P.106.            */
    /* In earlier versions the MSBs of the seed affected only the  */
    /* MSBs of mt[] (2002/01/09, modified by Makoto Matsumoto).    */
    unsigned long next = 1812433253UL * (prev ^ (prev >> 30)) + mti;

    /* keep 32 bits only, for machines where long is wider */
    next &= 0xffffffffUL;

    mt[mti] = next;
    prev = next;
    mti++;
  }
}

unsigned int
genrand_mine(int num_rand)
{
    int i;
    int k;
    const vec_uchar16 vshuf = (vec_uchar16){4, 5, 6, 7, 8, 9, 10, 11,
                                            12, 13, 14, 15, 16, 17, 18, 19};
    const vec_uint4 vone = spu_splats(0x1U);
    const vec_uint4 vsel = spu_splats(0x7fffffffU);
    vec_uint4 *vmt = (vec_uint4 *)mt;
    vec_uint4 v1a, v1b, v2a, v2b, v1, v2;
    vec_uint4 vmag01;
    vec_uint4 vsum = spu_splats(0U);
    vec_uint4 vy, vy2;

    for (i = 0, k = 0; i < num_rand / 4; i++, k++) {
        vec_int4 vk, vk1, vk2, vk3;
        vec_int4 vp1, vp99, vp100, vm56, vm57;

        if (__builtin_expect((k == 156), 0))
            k = 0;

        v1a = vmt[k];

        vk = spu_promote(k, 0);       // vkword0k
        vp1 = spu_add(vk, 1);         // vp1word0k+1ˤ
        vp99 = spu_add(vk, 99);       // vp99word0k+99ˤ
        vp100 = spu_add(vk, 100);     // vp100word0k+100ˤ
        vm56 = spu_add(vk, (int)-56); // vm56word0k-56ˤ
        vm57 = spu_add(vk, (int)-57); // vm56word0k-57ˤ

        // vk1word0ϡvkword0154礭ä0ˤʤꡤ
        // Ǥʤvp1word0ˤʤ
        vk1 = spu_sel(vp1, spu_splats(0), spu_cmpgt(vk, 154));
        vk2 = spu_sel(vp99, vm57, spu_cmpgt(vk, 56));
        vk3 = spu_sel(vp100, vm56, spu_cmpgt(vk, 55));

        // vk1word0򥹥ΰˤvmtΥǥåȤѤ
        v1b = vmt[spu_extract(vk1, 0)];
        v2a = vmt[spu_extract(vk2, 0)];
        v2b = vmt[spu_extract(vk3, 0)];

        v1 = spu_shuffle(v1a, v1b, vshuf);
        v2 = spu_shuffle(v2a, v2b, vshuf);
        vy = spu_sel(v1a, v1, vsel);
        vmag01 = spu_and(spu_cmpeq(spu_and(vy, vone), vone), 0x9908b0dfU);
        vmt[k] = spu_xor(spu_xor(v2, spu_rlmask(vy, -1)), vmag01);

        vy2 = vmt[k];
        vy2 = spu_xor(vy2, spu_rlmask(vy2, -11));
        vy2 = spu_xor(vy2, spu_and(spu_sl(vy2, 7), 0x9d2c5680U));
        vy2 = spu_xor(vy2, spu_and(spu_sl(vy2, 15), 0xefc60000U));
        vy2 = spu_xor(vy2, spu_rlmask(vy2, -18));
        vsum = spu_add(vsum, vy2);
    }
    vsum = spu_add(vsum, spu_rlqwbyte(vsum, 8));
    vsum = spu_add(vsum, spu_rlqwbyte(vsum, 4));
    return spu_extract(vsum, 0);
}
