#include <stdio.h>
#include <spu_intrinsics.h>
#include <spu_mfcio.h>
#include "mt_mine.h"

/* Period parameters */
/* N is the number of words in the state array mt[]. */
#define N 624
/* M, MATRIX_A, UPPER_MASK and LOWER_MASK are not referenced by name in the
 * code visible in this file; genrand_mine() spells the corresponding
 * constants out as literals. */
#define M 397
#define MATRIX_A 0x9908b0dfUL   /* constant vector a */
#define UPPER_MASK 0x80000000UL /* most significant w-r bits */
#define LOWER_MASK 0x7fffffffUL /* least significant r bits */

/* Generator state. It is 16-byte aligned because genrand_mine() accesses it
 * through a vec_uint4 pointer.
 * NOTE(review): that access pattern assumes unsigned long is 32 bits wide on
 * the target, so that N words are exactly N/4 vectors -- confirm. */
static unsigned long mt[N] __attribute__((aligned(16))); 
/* init_genrand_mine() uses mti as its loop index and leaves it equal to N. */
static int mti=N+1; /* mti==N+1 means mt[N] is not initialized */

/*
 * Seed the state array mt[] from a single value.
 *
 * s: seed; only its low 32 bits are used.
 *
 * Every word after the first is derived from its predecessor, and each
 * stored word is masked to 32 bits.  On return mti == N.
 */
void init_genrand_mine(unsigned long s)
{
  unsigned long prev = s & 0xffffffffUL;
  int idx = 1;

  mt[0] = prev;
  while (idx < N) {
    /* See Knuth TAOCP Vol2. 3rd Ed. P.106 for multiplier. */
    /* In the previous versions, MSBs of the seed affect   */
    /* only MSBs of the array mt[].                        */
    /* 2002/01/09 modified by Makoto Matsumoto             */
    unsigned long next = 1812433253UL * (prev ^ (prev >> 30)) + idx;

    /* Keep 32 bits only, for machines where unsigned long is wider. */
    next &= 0xffffffffUL;

    mt[idx] = next;
    prev = next;
    idx++;
  }

  /* The original loop used mti as its counter and so finished with mti == N. */
  mti = N;
}

unsigned int
genrand_mine(int num_rand)
{
    int i;
    int k, k1, k2, k3;
    const vec_uchar16 vshuf = (vec_uchar16){4, 5, 6, 7, 8, 9, 10, 11,
                                            12, 13, 14, 15, 16, 17, 18, 19};
    const vec_uint4 vone = spu_splats(0x1U);
    const vec_uint4 vsel = spu_splats(0x7fffffffU);
    vec_uint4 *vmt = (vec_uint4 *)mt;
    // ѿλȤޤ路ơ٤̤ѿȤ
    vec_uint4 v1a[4], v1b[4], v2a[4], v2b[4], v1[4], v2[4];
    vec_uint4 vmag01[4];
    vec_uint4 vsum = spu_splats(0U);
    vec_uint4 vy[4], vy2[4];
    int loop1 = num_rand / 624;
    int loop2 = num_rand - loop1 * 624;

    for (i = 0; i < loop1; i++) {

        k1 = 1;
        k2 = 99;
        k3 = 100;

        v1a[0] = vmt[0];
        v1a[1] = vmt[1];
        v1a[2] = vmt[2];
        v1a[3] = vmt[3];

        v1b[0] = vmt[1];
        v1b[1] = vmt[2];
        v1b[2] = vmt[3];
        v1b[3] = vmt[4];

        v2a[0] = vmt[99];
        v2a[1] = vmt[100];
        v2a[2] = vmt[101];
        v2a[3] = vmt[102];

        v2b[0] = vmt[100];
        v2b[1] = vmt[101];
        v2b[2] = vmt[102];
        v2b[3] = vmt[103];

        for (k = 4; k < 156; k += 4) {


            ////////////////////

            v1[0] = spu_shuffle(v1a[0], v1b[0], vshuf);
            v1[1] = spu_shuffle(v1a[1], v1b[1], vshuf);
            v1[2] = spu_shuffle(v1a[2], v1b[2], vshuf);
            v1[3] = spu_shuffle(v1a[3], v1b[3], vshuf);

            v2[0] = spu_shuffle(v2a[0], v2b[0], vshuf);
            v2[1] = spu_shuffle(v2a[1], v2b[1], vshuf);
            v2[2] = spu_shuffle(v2a[2], v2b[2], vshuf);
            v2[3] = spu_shuffle(v2a[3], v2b[3], vshuf);

            vy[0] = spu_sel(v1a[0], v1[0], vsel);
            vy[1] = spu_sel(v1a[1], v1[1], vsel);
            vy[2] = spu_sel(v1a[2], v1[2], vsel);
            vy[3] = spu_sel(v1a[3], v1[3], vsel);


            vmag01[0] = spu_and(spu_cmpeq(spu_and(vy[0], vone), vone),
                                0x9908b0dfU);
            vmag01[1] = spu_and(spu_cmpeq(spu_and(vy[1], vone), vone),
                                0x9908b0dfU);
            vmag01[2] = spu_and(spu_cmpeq(spu_and(vy[2], vone), vone),
                                0x9908b0dfU);
            vmag01[3] = spu_and(spu_cmpeq(spu_and(vy[3], vone), vone),
                                0x9908b0dfU);

            vy2[0] = spu_xor(spu_xor(v2[0], spu_rlmask(vy[0], -1)), vmag01[0]);

            vy2[1] = spu_xor(spu_xor(v2[1], spu_rlmask(vy[1], -1)), vmag01[1]);

            vy2[2] = spu_xor(spu_xor(v2[2], spu_rlmask(vy[2], -1)), vmag01[2]);

            vy2[3] = spu_xor(spu_xor(v2[3], spu_rlmask(vy[3], -1)), vmag01[3]);
            
            vmt[k - 4] = vy2[0];
            vmt[k - 3] = vy2[1];
            vmt[k - 2] = vy2[2];
            vmt[k - 1] = vy2[3];

            vy2[0] = spu_xor(vy2[0], spu_rlmask(vy2[0], -11));
            vy2[1] = spu_xor(vy2[1], spu_rlmask(vy2[1], -11));
            vy2[2] = spu_xor(vy2[2], spu_rlmask(vy2[2], -11));
            vy2[3] = spu_xor(vy2[3], spu_rlmask(vy2[3], -11));

            vy2[0] = spu_xor(vy2[0], spu_and(spu_sl(vy2[0], 7), 0x9d2c5680U));
            vy2[1] = spu_xor(vy2[1], spu_and(spu_sl(vy2[1], 7), 0x9d2c5680U));
            vy2[2] = spu_xor(vy2[2], spu_and(spu_sl(vy2[2], 7), 0x9d2c5680U));
            vy2[3] = spu_xor(vy2[3], spu_and(spu_sl(vy2[3], 7), 0x9d2c5680U));

            vy2[0] = spu_xor(vy2[0], spu_and(spu_sl(vy2[0], 15), 0xefc60000U));
            vy2[1] = spu_xor(vy2[1], spu_and(spu_sl(vy2[1], 15), 0xefc60000U));
            vy2[2] = spu_xor(vy2[2], spu_and(spu_sl(vy2[2], 15), 0xefc60000U));
            vy2[3] = spu_xor(vy2[3], spu_and(spu_sl(vy2[3], 15), 0xefc60000U));

            vy2[0] = spu_xor(vy2[0], spu_rlmask(vy2[0], -18));
            vy2[1] = spu_xor(vy2[1], spu_rlmask(vy2[1], -18));
            vy2[2] = spu_xor(vy2[2], spu_rlmask(vy2[2], -18));
            vy2[3] = spu_xor(vy2[3], spu_rlmask(vy2[3], -18));

            vsum = spu_add(vsum, vy2[0]);
            vsum = spu_add(vsum, vy2[1]);
            vsum = spu_add(vsum, vy2[2]);
            vsum = spu_add(vsum, vy2[3]);

            k1 += 4;
            k2 += 4;
            k3 += 4;

            ///// 

            v1a[0] = vmt[k];
            v1a[1] = vmt[k + 1];
            v1a[2] = vmt[k + 2];
            v1a[3] = vmt[k + 3];

            v1b[0] = vmt[k1];
            v1b[1] = vmt[k1 + 1];
            v1b[2] = vmt[k1 + 2];
            if (k1 == 153)
                k1 = -3;
            v1b[3] = vmt[k1 + 3];

            
            v2a[0] = vmt[k2];
            if (k2 == 155)
                k2 = -1;
            v2a[1] = vmt[k2 + 1];
            v2a[2] = vmt[k2 + 2];
            v2a[3] = vmt[k2 + 3];

            v2b[0] = vmt[k3];
            v2b[1] = vmt[k3 + 1];
            v2b[2] = vmt[k3 + 2];
            v2b[3] = vmt[k3 + 3];

            if (k3 == 152)
                k3 = -4;
        }

        v1[0] = spu_shuffle(v1a[0], v1b[0], vshuf);
        v1[1] = spu_shuffle(v1a[1], v1b[1], vshuf);
        v1[2] = spu_shuffle(v1a[2], v1b[2], vshuf);
        v1[3] = spu_shuffle(v1a[3], v1b[3], vshuf);

        v2[0] = spu_shuffle(v2a[0], v2b[0], vshuf);
        v2[1] = spu_shuffle(v2a[1], v2b[1], vshuf);
        v2[2] = spu_shuffle(v2a[2], v2b[2], vshuf);
        v2[3] = spu_shuffle(v2a[3], v2b[3], vshuf);

        vy[0] = spu_sel(v1a[0], v1[0], vsel);
        vy[1] = spu_sel(v1a[1], v1[1], vsel);
        vy[2] = spu_sel(v1a[2], v1[2], vsel);
        vy[3] = spu_sel(v1a[3], v1[3], vsel);


        vmag01[0] = spu_and(spu_cmpeq(spu_and(vy[0], vone), vone), 0x9908b0dfU);
        vmag01[1] = spu_and(spu_cmpeq(spu_and(vy[1], vone), vone), 0x9908b0dfU);
        vmag01[2] = spu_and(spu_cmpeq(spu_and(vy[2], vone), vone), 0x9908b0dfU);
        vmag01[3] = spu_and(spu_cmpeq(spu_and(vy[3], vone), vone), 0x9908b0dfU);

        vy2[0] = spu_xor(spu_xor(v2[0], spu_rlmask(vy[0], -1)), vmag01[0]);

        vy2[1] = spu_xor(spu_xor(v2[1], spu_rlmask(vy[1], -1)), vmag01[1]);

        vy2[2] = spu_xor(spu_xor(v2[2], spu_rlmask(vy[2], -1)), vmag01[2]);

        vy2[3] = spu_xor(spu_xor(v2[3], spu_rlmask(vy[3], -1)), vmag01[3]);
            
        vmt[152] = vy2[0];
        vmt[153] = vy2[1];
        vmt[154] = vy2[2];
        vmt[155] = vy2[3];

        vy2[0] = spu_xor(vy2[0], spu_rlmask(vy2[0], -11));
        vy2[1] = spu_xor(vy2[1], spu_rlmask(vy2[1], -11));
        vy2[2] = spu_xor(vy2[2], spu_rlmask(vy2[2], -11));
        vy2[3] = spu_xor(vy2[3], spu_rlmask(vy2[3], -11));

        vy2[0] = spu_xor(vy2[0], spu_and(spu_sl(vy2[0], 7), 0x9d2c5680U));
        vy2[1] = spu_xor(vy2[1], spu_and(spu_sl(vy2[1], 7), 0x9d2c5680U));
        vy2[2] = spu_xor(vy2[2], spu_and(spu_sl(vy2[2], 7), 0x9d2c5680U));
        vy2[3] = spu_xor(vy2[3], spu_and(spu_sl(vy2[3], 7), 0x9d2c5680U));

        vy2[0] = spu_xor(vy2[0], spu_and(spu_sl(vy2[0], 15), 0xefc60000U));
        vy2[1] = spu_xor(vy2[1], spu_and(spu_sl(vy2[1], 15), 0xefc60000U));
        vy2[2] = spu_xor(vy2[2], spu_and(spu_sl(vy2[2], 15), 0xefc60000U));
        vy2[3] = spu_xor(vy2[3], spu_and(spu_sl(vy2[3], 15), 0xefc60000U));

        vy2[0] = spu_xor(vy2[0], spu_rlmask(vy2[0], -18));
        vy2[1] = spu_xor(vy2[1], spu_rlmask(vy2[1], -18));
        vy2[2] = spu_xor(vy2[2], spu_rlmask(vy2[2], -18));
        vy2[3] = spu_xor(vy2[3], spu_rlmask(vy2[3], -18));

        vsum = spu_add(vsum, vy2[0]);
        vsum = spu_add(vsum, vy2[1]);
        vsum = spu_add(vsum, vy2[2]);
        vsum = spu_add(vsum, vy2[3]);


    }


    k1 = 1;
    k2 = 99;
    k3 = 100;

    for (k = 0; k < loop2 / 4; k++) {
        v1a[0] = vmt[k];
        v1b[0] = vmt[k1];
        v2a[0] = vmt[k2];
        v2b[0] = vmt[k3];

        v1[0] = spu_shuffle(v1a[0], v1b[0], vshuf);
        v2[0] = spu_shuffle(v2a[0], v2b[0], vshuf);
        vy[0] = spu_sel(v1a[0], v1[0], vsel);
        vmag01[0] = spu_and(spu_cmpeq(spu_and(vy[0], vone), vone), 0x9908b0dfU);
        vy2[0] = spu_xor(spu_xor(v2[0], spu_rlmask(vy[0], -1)), vmag01[0]);

        vmt[k] = vy2[0];
        vy2[0] = spu_xor(vy2[0], spu_rlmask(vy2[0], -11));
        vy2[0] = spu_xor(vy2[0], spu_and(spu_sl(vy2[0], 7), 0x9d2c5680U));
        vy2[0] = spu_xor(vy2[0], spu_and(spu_sl(vy2[0], 15), 0xefc60000U));
        vy2[0] = spu_xor(vy2[0], spu_rlmask(vy2[0], -18));
        vsum = spu_add(vsum, vy2[0]);

        k1++;
        if (k1 == 156)
            k1 = 0;
        k2++;
        if (k2 == 156)
            k2 = 0;
        k3++;
        if (k3 == 156)
            k3 = 0;
    }

    vsum = spu_add(vsum, spu_rlqwbyte(vsum, 8));
    vsum = spu_add(vsum, spu_rlqwbyte(vsum, 4));
    return spu_extract(vsum, 0);
}
