#include <stdio.h>
#include <spu_intrinsics.h>
#include <spu_mfcio.h>
#include "mt_mine.h"

/*
 * Period parameters and generator state for the Mersenne Twister.
 *
 * NOTE(review): genrand_mine() reinterprets mt[] as an array of vec_uint4,
 * which presumably requires `unsigned long` to be exactly 32 bits wide on the
 * target -- confirm against the platform ABI.
 */
/* Period parameters */
/* N: number of words in the state array mt[]. */
#define N 624
/* M: word offset of the second state element mixed in by the recurrence. */
#define M 397
#define MATRIX_A 0x9908b0dfUL   /* constant vector a */
#define UPPER_MASK 0x80000000UL /* most significant w-r bits */
#define LOWER_MASK 0x7fffffffUL /* least significant r bits */

/*
 * Generator state.  Aligned to 16 bytes because genrand_mine() accesses it
 * through a vec_uint4 pointer, i.e. one quadword (four words) at a time.
 */
static unsigned long mt[N] __attribute__((aligned(16)));
/*
 * Seeding sentinel: starts at N+1 and is left at N by init_genrand_mine().
 */
static int mti=N+1; /* mti==N+1 means mt[N] is not initialized */

/*
 * Seeds the generator state.
 *
 * s: seed value; only its low 32 bits are used.
 *
 * Fills every word of mt[] from the seed and leaves mti equal to N, which
 * marks the state as initialized.
 */
void init_genrand_mine(unsigned long s)
{
  unsigned long prev = s & 0xffffffffUL;
  int i;

  mt[0] = prev;
  for (i = 1; i < N; i++) {
    /* Multiplier from Knuth, TAOCP Vol. 2, 3rd ed., p. 106.  Folding in   */
    /* (prev >> 30) lets the high bits of the seed reach the low bits of   */
    /* later words (2002/01/09 change by Makoto Matsumoto).                */
    /* The final mask keeps each word to 32 bits on wider machines.        */
    prev = (1812433253UL * (prev ^ (prev >> 30)) + (unsigned long)i)
           & 0xffffffffUL;
    mt[i] = prev;
  }
  mti = N;
}

unsigned int
genrand_mine(int num_rand)
{
    int i;
    int k, k1, k2, k3;
    const vec_uchar16 vshuf = (vec_uchar16){4, 5, 6, 7, 8, 9, 10, 11,
                                            12, 13, 14, 15, 16, 17, 18, 19};
    const vec_uint4 vone = spu_splats(0x1U);
    const vec_uint4 vsel = spu_splats(0x7fffffffU);
    vec_uint4 *vmt = (vec_uint4 *)mt;
    vec_uint4 v1a, v1b, v2a, v2b, v1, v2;
    vec_uint4 vmag01;
    vec_uint4 vsum = spu_splats(0U);
    vec_uint4 vy, vy2;
    int loop1 = num_rand / 624; // 624ܿʬᥤ롼פ
    int loop2 = num_rand - loop1 * 624; // ĤβʬüԤ

    for (i = 0; i < loop1; i++) {
        k1 = 1;
        k2 = 99;
        k3 = 100;
        for (k  = 0; k < 156; k++) {
            v1a = vmt[k];
            v1b = vmt[k1];
            v2a = vmt[k2];
            v2b = vmt[k3];

            v1 = spu_shuffle(v1a, v1b, vshuf);
            v2 = spu_shuffle(v2a, v2b, vshuf);
            vy = spu_sel(v1a, v1, vsel);
            vmag01 = spu_and(spu_cmpeq(spu_and(vy, vone), vone), 0x9908b0dfU);
            vmt[k] = spu_xor(spu_xor(v2, spu_rlmask(vy, -1)), vmag01);

            vy2 = vmt[k];
            vy2 = spu_xor(vy2, spu_rlmask(vy2, -11));
            vy2 = spu_xor(vy2, spu_and(spu_sl(vy2, 7), 0x9d2c5680U));
            vy2 = spu_xor(vy2, spu_and(spu_sl(vy2, 15), 0xefc60000U));
            vy2 = spu_xor(vy2, spu_rlmask(vy2, -18));
            vsum = spu_add(vsum, vy2);

            k1++;
            if (k1 == 156)
                k1 = 0;
            k2++;
            if (k2 == 156)
                k2 = 0;
            k3++;
            if (k3 == 156)
                k3 = 0;
        }
    }

    k1 = 1;
    k2 = 99;
    k3 = 100;

    for (k = 0; k < loop2 / 4; k++) {
        v1a = vmt[k];
        v1b = vmt[k1];
        v2a = vmt[k2];
        v2b = vmt[k3];

        v1 = spu_shuffle(v1a, v1b, vshuf);
        v2 = spu_shuffle(v2a, v2b, vshuf);
        vy = spu_sel(v1a, v1, vsel);
        vmag01 = spu_and(spu_cmpeq(spu_and(vy, vone), vone), 0x9908b0dfU);
        vmt[k] = spu_xor(spu_xor(v2, spu_rlmask(vy, -1)), vmag01);

        vy2 = vmt[k];
        vy2 = spu_xor(vy2, spu_rlmask(vy2, -11));
        vy2 = spu_xor(vy2, spu_and(spu_sl(vy2, 7), 0x9d2c5680U));
        vy2 = spu_xor(vy2, spu_and(spu_sl(vy2, 15), 0xefc60000U));
        vy2 = spu_xor(vy2, spu_rlmask(vy2, -18));
        vsum = spu_add(vsum, vy2);

        k1++;
        if (k1 == 156)
            k1 = 0;
        k2++;
        if (k2 == 156)
            k2 = 0;
        k3++;
        if (k3 == 156)
            k3 = 0;
    }

    vsum = spu_add(vsum, spu_rlqwbyte(vsum, 8));
    vsum = spu_add(vsum, spu_rlqwbyte(vsum, 4));
    return spu_extract(vsum, 0);
}
