#include <stdio.h>              /*  */
#include <spu_intrinsics.h>
#include <spu_mfcio.h>
#include "mt_mine.h"

/* Period parameters */
#define N 624
#define M 397
#define MATRIX_A 0x9908b0dfUL   /* constant vector a */
#define UPPER_MASK 0x80000000UL /* most significant w-r bits */
#define LOWER_MASK 0x7fffffffUL /* least significant r bits */

//////////////////////////
/*
 * Shared state for the two-SPE producer/consumer scheme used in this file.
 * SPE 0 runs genrand_mine_update() (producer); the other SPE runs
 * genrand_mine_tempering() (consumer).
 */

/* High/low 32-bit halves of a 64-bit base address, indexed by SPE number.
 * genrand_mine() and barrier_init() add a local object's address to this base
 * to obtain the DMA target in the peer (presumably the effective address of
 * each SPE's local store; defined elsewhere -- confirm). */
extern unsigned ls_high[2], ls_low[2];
/* Index of the SPE this code runs on (defined elsewhere). */
extern int speid;
/* Read position: index of the next mt[] block the consumer will process.
 * Written by genrand_mine_tempering() and copied to the peer by DMA. */
volatile int rp __attribute__((aligned(16)));
/* Write position: index of the next mt[] block the producer will fill.
 * Written by genrand_mine_update() and copied to the peer by DMA. */
volatile int wp __attribute__((aligned(16)));
/* 64-bit DMA target addresses (split in halves) of the peer's wp / rp. */
unsigned wp_ea_high, wp_ea_low;
unsigned rp_ea_high, rp_ea_low;
/* Number of N-word blocks in mt[]; rp and wp wrap at this value. */
#define BUFFERS 4
/* DMA target address of each block in the peer's mt[]:
 * [b][0] is the first 304 words of block b, [b][1] the remaining 320 words. */
unsigned mt_ea_high[BUFFERS][2], mt_ea_low[BUFFERS][2];
/* MFC tag group used for every DMA issued in this file. */
#define TAGID 3

/* Barrier flags, one 16-byte row per role: row 0 is SPE 0's "release" flag,
 * row 1 is the other SPE's "join" flag.  Each SPE DMAs its own row into the
 * same row of the peer and spins on the other row. */
volatile unsigned barrier_buf[2][4] __attribute__((aligned(16))) = {{0}};
/* Low/high halves of the DMA target address for this SPE's barrier row. */
unsigned barrier_buf_eal, barrier_buf_eah;
//////////////////////////

/* Generator storage: BUFFERS consecutive blocks of N 32-bit words.
 * init_genrand_mine() seeds block 0, genrand_mine_update() regenerates
 * block 0 in place, and genrand_mine_tempering() reads block rp. */
static unsigned mt[N * BUFFERS] __attribute__((aligned(16)));

static int mti = N + 1; /* mti==N+1 means mt[N] is not initialized */


/*
 * Seed the first N words of mt[] from s.
 *
 * mt[0] is the low 32 bits of s; every following word is derived from its
 * predecessor with the multiplier 1812433253 (Knuth TAOCP Vol2, 3rd Ed.,
 * p.106) plus the word index.  On return mti == N.
 */
void init_genrand_mine(unsigned long s)
{
    unsigned prev = (unsigned)(s & 0xffffffffUL);
    int idx;

    mt[0] = prev;
    for (idx = 1; idx < N; idx++) {
        /* Mask to 32 bits so the result is the same where long is wider. */
        prev = (unsigned)((1812433253UL * (prev ^ (prev >> 30))
                           + (unsigned long)idx) & 0xffffffffUL);
        mt[idx] = prev;
    }
    mti = idx;
}


/*
 * Producer side of the two-SPE pipeline.
 *
 * Regenerates the 624-word state in mt[0..N-1] (viewed as 156 four-word
 * vectors) loop1 times.  Each regenerated block is sent by DMA to slot wp of
 * the peer's mt[] in two pieces (vectors 0-75, then vectors 76-155), after
 * which wp is advanced and published to the peer.
 *
 * For every word i the recurrence computed is
 *     y     = (mt[i] & 0x80000000) | (mt[i+1] & 0x7fffffff)
 *     mt[i] = mt[(i+397) % 624] ^ (y >> 1) ^ ((y & 1) ? 0x9908b0df : 0)
 * four vectors (16 words) at a time, with the operands for the next group
 * loaded at the end of the current iteration.
 *
 * loop1: number of full 624-word blocks to produce.
 * Relies on mt_ea_high/low, wp_ea_high/low and the MFC tag mask having been
 * set up by the caller (genrand_mine() does this before calling).
 */
void
genrand_mine_update(int loop1)
{
    int i;

    /* Shuffle pattern 4..19: bytes 4-15 of the first operand followed by
     * bytes 0-3 of the second, i.e. the same data shifted by one word. */
    const vec_uchar16 vshuf = (vec_uchar16){4, 5, 6, 7, 8, 9, 10, 11,
                                            12, 13, 14, 15, 16, 17, 18, 19};
    const vec_uint4 vone = spu_splats(0x1U);
    /* Select mask: low 31 bits come from the second operand of spu_sel. */
    const vec_uint4 vsel = spu_splats(0x7fffffffU);

    /* The producer always works on block 0 of its own mt[]. */
    vec_uint4 *vmt = (vec_uint4 *)mt;

    /* v1a/v1b: vectors holding mt[i..] and the following vector;
     * v2a/v2b: vectors around mt[i+397]; v1/v2: the one-word-shifted views. */
    vec_uint4 v1a[4], v1b[4], v2a[4], v2b[4], v1[4], v2[4];
    vec_uint4 vmag01[4];
    vec_uint4 vy[4];

    /* k: vector index of the group being prefetched.
     * k1+3: vector following that group (wraps to 0 after vector 155).
     * k2..k2+3: vectors holding the mt[i+397] operands (wrap after 155).
     * k3+3: vector following the k2 group (wraps after 155). */
    int k, k1, k2, k3;

    int wpnext;

    for (i = 0; i < loop1; i++) {

        /* ---- First part: produce vectors 0..75 (words 0..303). ---- */
        k1 = 1;
        k2 = 99;    /* word 396; the one-word shift makes it word 397 */
        k3 = 100;

        /* Prime the pipeline with the operands for vectors 0..3. */
        v1a[0] = vmt[0];
        v1a[1] = vmt[1];
        v1a[2] = vmt[2];
        v1a[3] = vmt[3];

        v1b[0] = vmt[1];
        v1b[1] = vmt[2];
        v1b[2] = vmt[3];
        v1b[3] = vmt[4];

        v2a[0] = vmt[99];
        v2a[1] = vmt[100];
        v2a[2] = vmt[101];
        v2a[3] = vmt[102];

        v2b[0] = vmt[100];
        v2b[1] = vmt[101];
        v2b[2] = vmt[102];
        v2b[3] = vmt[103];


        for (k = 4; k < 76; k += 4) {


            /* v1 = words i+1..i+4, v2 = words i+397..i+400. */
            v1[0] = spu_shuffle(v1a[0], v1b[0], vshuf);
            v1[1] = spu_shuffle(v1a[1], v1b[1], vshuf);
            v1[2] = spu_shuffle(v1a[2], v1b[2], vshuf);
            v1[3] = spu_shuffle(v1a[3], v1b[3], vshuf);

            v2[0] = spu_shuffle(v2a[0], v2b[0], vshuf);
            v2[1] = spu_shuffle(v2a[1], v2b[1], vshuf);
            v2[2] = spu_shuffle(v2a[2], v2b[2], vshuf);
            v2[3] = spu_shuffle(v2a[3], v2b[3], vshuf);

            /* y = top bit of mt[i] combined with low 31 bits of mt[i+1]. */
            vy[0] = spu_sel(v1a[0], v1[0], vsel);
            vy[1] = spu_sel(v1a[1], v1[1], vsel);
            vy[2] = spu_sel(v1a[2], v1[2], vsel);
            vy[3] = spu_sel(v1a[3], v1[3], vsel);

            /* Gather the low bit of each lane, expand it to a per-lane
             * all-ones/all-zeros mask, and keep 0x9908b0df where y is odd. */
            vmag01[0] = spu_and(spu_maskw(spu_extract(spu_gather(vy[0]), 0)),
                                0x9908b0dfU);
            vmag01[1] = spu_and(spu_maskw(spu_extract(spu_gather(vy[1]), 0)),
                                0x9908b0dfU);
            vmag01[2] = spu_and(spu_maskw(spu_extract(spu_gather(vy[2]), 0)),
                                0x9908b0dfU);
            vmag01[3] = spu_and(spu_maskw(spu_extract(spu_gather(vy[3]), 0)),
                                0x9908b0dfU);

            /* New state: mt[i+397] ^ (y >> 1) ^ mag; stored to the group
             * just before the one being prefetched. */
            vmt[k - 4] = spu_xor(spu_xor(v2[0], spu_rlmask(vy[0], -1)), 
                                 vmag01[0]);
            vmt[k - 3] = spu_xor(spu_xor(v2[1], spu_rlmask(vy[1], -1)), 
                                 vmag01[1]);
            vmt[k - 2] = spu_xor(spu_xor(v2[2], spu_rlmask(vy[2], -1)), 
                                 vmag01[2]);
            vmt[k - 1] = spu_xor(spu_xor(v2[3], spu_rlmask(vy[3], -1)), 
                                 vmag01[3]);
            
            k1 += 4;
            k2 += 4;
            k3 += 4;

            /* Prefetch the operands for the next group of four vectors. */

            v1a[0] = vmt[k];
            v1a[1] = vmt[k + 1];
            v1a[2] = vmt[k + 2];
            v1a[3] = vmt[k + 3];

            v1b[0] = vmt[k + 1];
            v1b[1] = vmt[k + 2];
            v1b[2] = vmt[k + 3];
            /* Wrap to vector 0 after vector 155 (k1 only reaches 153 in the
             * second loop below). */
            if (k1 == 153)
                k1 = -3;
            v1b[3] = vmt[k1 + 3];

            
            v2a[0] = vmt[k2]; /* vectors 99 ... 155 */
            /* Vector 155 is the last one: the following ones wrap to 0. */
            if (k2 == 155)
                k2 = -1;
            v2a[1] = vmt[k2 + 1]; /* vectors 100 ... 155, then 0 */
            v2a[2] = vmt[k2 + 2];
            v2a[3] = vmt[k2 + 3];

            v2b[0] = vmt[k2 + 1]; /* vectors 100 ... 155, then 0 */
            v2b[1] = vmt[k2 + 2];
            v2b[2] = vmt[k2 + 3];
            v2b[3] = vmt[k3 + 3];

            /* After reading vector 155 via k3, restart so that the next
             * k3 + 3 is vector 3. */
            if (k3 == 152)
                k3 = -4;

        }


        /* Drain the pipeline: compute vectors 72..75 from the operands
         * loaded in the last loop iteration (no further prefetch). */
        v1[0] = spu_shuffle(v1a[0], v1b[0], vshuf);
        v1[1] = spu_shuffle(v1a[1], v1b[1], vshuf);
        v1[2] = spu_shuffle(v1a[2], v1b[2], vshuf);
        v1[3] = spu_shuffle(v1a[3], v1b[3], vshuf);

        v2[0] = spu_shuffle(v2a[0], v2b[0], vshuf);
        v2[1] = spu_shuffle(v2a[1], v2b[1], vshuf);
        v2[2] = spu_shuffle(v2a[2], v2b[2], vshuf);
        v2[3] = spu_shuffle(v2a[3], v2b[3], vshuf);

        vy[0] = spu_sel(v1a[0], v1[0], vsel);
        vy[1] = spu_sel(v1a[1], v1[1], vsel);
        vy[2] = spu_sel(v1a[2], v1[2], vsel);
        vy[3] = spu_sel(v1a[3], v1[3], vsel);

        /* Same odd-lane mask as in the loop, built with a compare instead
         * of gather/maskw. */
        vmag01[0] = spu_and(spu_cmpeq(spu_and(vy[0], vone), vone), 0x9908b0dfU);
        vmag01[1] = spu_and(spu_cmpeq(spu_and(vy[1], vone), vone), 0x9908b0dfU);
        vmag01[2] = spu_and(spu_cmpeq(spu_and(vy[2], vone), vone), 0x9908b0dfU);
        vmag01[3] = spu_and(spu_cmpeq(spu_and(vy[3], vone), vone), 0x9908b0dfU);

           
        vmt[72] = spu_xor(spu_xor(v2[0], spu_rlmask(vy[0], -1)), vmag01[0]);
        vmt[73] = spu_xor(spu_xor(v2[1], spu_rlmask(vy[1], -1)), vmag01[1]);
        vmt[74] = spu_xor(spu_xor(v2[2], spu_rlmask(vy[2], -1)), vmag01[2]);
        vmt[75] = spu_xor(spu_xor(v2[3], spu_rlmask(vy[3], -1)), vmag01[3]);


        /* ---- Second part: produce vectors 76..155 (words 304..623). ----
         * Word 304 + 397 wraps to word 77, i.e. vector 19 shifted by one. */
        k1 = 77;
        k2 = 19;
        k3 = 20;



        v1a[0] = vmt[76];
        v1a[1] = vmt[77];
        v1a[2] = vmt[78];
        v1a[3] = vmt[79];

        v1b[0] = vmt[77];
        v1b[1] = vmt[78];
        v1b[2] = vmt[79];
        v1b[3] = vmt[80];

        v2a[0] = vmt[19];
        v2a[1] = vmt[20];
        v2a[2] = vmt[21];
        v2a[3] = vmt[22];

        v2b[0] = vmt[20];
        v2b[1] = vmt[21];
        v2b[2] = vmt[22];
        v2b[3] = vmt[23];

        /* Wait for every outstanding DMA in the tag mask, which includes
         * the second-half transfer issued at the end of the previous
         * iteration, before vectors 76..155 are overwritten below.
         * (Nothing is outstanding on the first iteration.) */
        spu_mfcstat(MFC_TAG_UPDATE_ALL);

        /* Start sending the first half (vectors 0..75) to slot wp of the
         * peer; the transfer overlaps with the second loop. */
        spu_mfcdma64(&vmt[0], mt_ea_high[wp][0], mt_ea_low[wp][0],
                     sizeof(vmt[0]) * 76, TAGID, MFC_PUT_CMD);

        for (k = 80; k < 156; k += 4) {

            v1[0] = spu_shuffle(v1a[0], v1b[0], vshuf);
            v1[1] = spu_shuffle(v1a[1], v1b[1], vshuf);
            v1[2] = spu_shuffle(v1a[2], v1b[2], vshuf);
            v1[3] = spu_shuffle(v1a[3], v1b[3], vshuf);

            v2[0] = spu_shuffle(v2a[0], v2b[0], vshuf);
            v2[1] = spu_shuffle(v2a[1], v2b[1], vshuf);
            v2[2] = spu_shuffle(v2a[2], v2b[2], vshuf);
            v2[3] = spu_shuffle(v2a[3], v2b[3], vshuf);

            vy[0] = spu_sel(v1a[0], v1[0], vsel);
            vy[1] = spu_sel(v1a[1], v1[1], vsel);
            vy[2] = spu_sel(v1a[2], v1[2], vsel);
            vy[3] = spu_sel(v1a[3], v1[3], vsel);

            vmag01[0] = spu_and(spu_maskw(spu_extract(spu_gather(vy[0]), 0)),
                                0x9908b0dfU);
            vmag01[1] = spu_and(spu_maskw(spu_extract(spu_gather(vy[1]), 0)),
                                0x9908b0dfU);
            vmag01[2] = spu_and(spu_maskw(spu_extract(spu_gather(vy[2]), 0)),
                                0x9908b0dfU);
            vmag01[3] = spu_and(spu_maskw(spu_extract(spu_gather(vy[3]), 0)),
                                0x9908b0dfU);

            vmt[k - 4] = spu_xor(spu_xor(v2[0], spu_rlmask(vy[0], -1)), 
                                 vmag01[0]);
            vmt[k - 3] = spu_xor(spu_xor(v2[1], spu_rlmask(vy[1], -1)), 
                                 vmag01[1]);
            vmt[k - 2] = spu_xor(spu_xor(v2[2], spu_rlmask(vy[2], -1)), 
                                 vmag01[2]);
            vmt[k - 1] = spu_xor(spu_xor(v2[3], spu_rlmask(vy[3], -1)), 
                                 vmag01[3]);

            k1 += 4;
            k2 += 4;
            k3 += 4;

            /* Prefetch the operands for the next group of four vectors. */

            v1a[0] = vmt[k];
            v1a[1] = vmt[k + 1];
            v1a[2] = vmt[k + 2];
            v1a[3] = vmt[k + 3];

            v1b[0] = vmt[k + 1];
            v1b[1] = vmt[k + 2];
            v1b[2] = vmt[k + 3];
            /* On the last iteration the vector after 155 is vector 0, which
             * already holds this generation's new value. */
            if (k1 == 153)
                k1 = -3;
            v1b[3] = vmt[k1 + 3];

            
            /* In this loop k2 runs 23..95 and k3 runs 24..96, so the two
             * wrap checks below never fire; they mirror the first loop. */
            v2a[0] = vmt[k2];
            if (k2 == 155)
                k2 = -1;
            v2a[1] = vmt[k2 + 1];
            v2a[2] = vmt[k2 + 2];
            v2a[3] = vmt[k2 + 3];

            v2b[0] = vmt[k2 + 1];
            v2b[1] = vmt[k2 + 2];
            v2b[2] = vmt[k2 + 3];
            v2b[3] = vmt[k3 + 3];

            if (k3 == 152)
                k3 = -4;

        }

        /* Drain the pipeline: compute vectors 152..155. */
        v1[0] = spu_shuffle(v1a[0], v1b[0], vshuf);
        v1[1] = spu_shuffle(v1a[1], v1b[1], vshuf);
        v1[2] = spu_shuffle(v1a[2], v1b[2], vshuf);
        v1[3] = spu_shuffle(v1a[3], v1b[3], vshuf);

        v2[0] = spu_shuffle(v2a[0], v2b[0], vshuf);
        v2[1] = spu_shuffle(v2a[1], v2b[1], vshuf);
        v2[2] = spu_shuffle(v2a[2], v2b[2], vshuf);
        v2[3] = spu_shuffle(v2a[3], v2b[3], vshuf);

        vy[0] = spu_sel(v1a[0], v1[0], vsel);
        vy[1] = spu_sel(v1a[1], v1[1], vsel);
        vy[2] = spu_sel(v1a[2], v1[2], vsel);
        vy[3] = spu_sel(v1a[3], v1[3], vsel);

        vmag01[0] = spu_and(spu_cmpeq(spu_and(vy[0], vone), vone), 0x9908b0dfU);
        vmag01[1] = spu_and(spu_cmpeq(spu_and(vy[1], vone), vone), 0x9908b0dfU);
        vmag01[2] = spu_and(spu_cmpeq(spu_and(vy[2], vone), vone), 0x9908b0dfU);
        vmag01[3] = spu_and(spu_cmpeq(spu_and(vy[3], vone), vone), 0x9908b0dfU);

        vmt[152] = spu_xor(spu_xor(v2[0], spu_rlmask(vy[0], -1)), vmag01[0]);
        vmt[153] = spu_xor(spu_xor(v2[1], spu_rlmask(vy[1], -1)), vmag01[1]);
        vmt[154] = spu_xor(spu_xor(v2[2], spu_rlmask(vy[2], -1)), vmag01[2]);
        vmt[155] = spu_xor(spu_xor(v2[3], spu_rlmask(vy[3], -1)), vmag01[3]);

        /* Advancing wp must not make it equal to rp; if the ring is full,
         * spin until the consumer's rp update arrives by DMA. */
        wpnext = wp + 1;
        if (wpnext == BUFFERS)
            wpnext = 0;
        
        if (__builtin_expect((wpnext == rp), 0))
            while (wpnext == rp)
                ;

        /* Wait for the first-half transfer to finish, so that the next
         * iteration may overwrite vectors 0..75. */
        spu_mfcstat(MFC_TAG_UPDATE_ALL);

        /* Send the second half (vectors 76..155) to slot wp of the peer. */
        spu_mfcdma64(&vmt[76], mt_ea_high[wp][1], mt_ea_low[wp][1], 
                     sizeof(vmt[0]) * 80, TAGID, MFC_PUT_CMD);
        wp = wpnext;

        /* Publish the new wp to the peer.  The fenced put is ordered after
         * the data transfers issued earlier in the same tag group. */
        spu_mfcdma64(&wp, wp_ea_high, wp_ea_low, sizeof(wp), TAGID,
                     MFC_PUTF_CMD);
    }
    return;
}




vec_uint4
genrand_mine_tempering(int loop1)
{
    int i;
    vec_uint4 *vmt;
    vec_uint4 vsum;
    vec_uint4 vy2[4];
    int k;

    vsum = spu_splats(0U);

    for (i = 0; i < loop1; i++) {

        // ξƱ̤Υǡʤ
        if (__builtin_expect((rp == wp), 0))
            while (rp == wp)
                ;
        vmt = (vec_uint4 *)(mt + 624 * rp);

        vy2[0] = vmt[0];
        vy2[1] = vmt[1];
        vy2[2] = vmt[2];
        vy2[3] = vmt[3];

        for (k = 4; k < 156; k += 4) {
            vy2[0] = spu_xor(vy2[0], spu_rlmask(vy2[0], -11));
            vy2[1] = spu_xor(vy2[1], spu_rlmask(vy2[1], -11));
            vy2[2] = spu_xor(vy2[2], spu_rlmask(vy2[2], -11));
            vy2[3] = spu_xor(vy2[3], spu_rlmask(vy2[3], -11));

            vy2[0] = spu_xor(vy2[0], spu_and(spu_slqw(vy2[0], 7), 0x9d2c5680U));
            vy2[1] = spu_xor(vy2[1], spu_and(spu_slqw(vy2[1], 7), 0x9d2c5680U));
            vy2[2] = spu_xor(vy2[2], spu_and(spu_slqw(vy2[2], 7), 0x9d2c5680U));
            vy2[3] = spu_xor(vy2[3], spu_and(spu_slqw(vy2[3], 7), 0x9d2c5680U));

            vy2[0] = spu_xor(vy2[0], spu_and(spu_slqwbytebc(spu_slqw(vy2[0], 15), 15), 0xefc60000U));
            vy2[1] = spu_xor(vy2[1], spu_and(spu_slqwbytebc(spu_slqw(vy2[1], 15), 15), 0xefc60000U));
            vy2[2] = spu_xor(vy2[2], spu_and(spu_slqwbytebc(spu_slqw(vy2[2], 15), 15), 0xefc60000U));
            vy2[3] = spu_xor(vy2[3], spu_and(spu_slqwbytebc(spu_slqw(vy2[3], 15), 15), 0xefc60000U));

            vy2[0] = spu_xor(vy2[0], spu_rlmask(vy2[0], -18));
            vy2[1] = spu_xor(vy2[1], spu_rlmask(vy2[1], -18));
            vy2[2] = spu_xor(vy2[2], spu_rlmask(vy2[2], -18));
            vy2[3] = spu_xor(vy2[3], spu_rlmask(vy2[3], -18));

            vsum = spu_add(vsum, vy2[0]);
            vsum = spu_add(vsum, vy2[1]);
            vsum = spu_add(vsum, vy2[2]);
            vsum = spu_add(vsum, vy2[3]);

            vy2[0] = vmt[k];
            vy2[1] = vmt[k + 1];
            vy2[2] = vmt[k + 2];
            vy2[3] = vmt[k + 3];
        }

        vy2[0] = spu_xor(vy2[0], spu_rlmask(vy2[0], -11));
        vy2[1] = spu_xor(vy2[1], spu_rlmask(vy2[1], -11));
        vy2[2] = spu_xor(vy2[2], spu_rlmask(vy2[2], -11));
        vy2[3] = spu_xor(vy2[3], spu_rlmask(vy2[3], -11));

        vy2[0] = spu_xor(vy2[0], spu_and(spu_slqw(vy2[0], 7), 0x9d2c5680U));
        vy2[1] = spu_xor(vy2[1], spu_and(spu_slqw(vy2[1], 7), 0x9d2c5680U));
        vy2[2] = spu_xor(vy2[2], spu_and(spu_slqw(vy2[2], 7), 0x9d2c5680U));
        vy2[3] = spu_xor(vy2[3], spu_and(spu_slqw(vy2[3], 7), 0x9d2c5680U));

        vy2[0] = spu_xor(vy2[0], spu_and(spu_slqwbytebc(spu_slqw(vy2[0], 15), 15), 0xefc60000U));
        vy2[1] = spu_xor(vy2[1], spu_and(spu_slqwbytebc(spu_slqw(vy2[1], 15), 15), 0xefc60000U));
        vy2[2] = spu_xor(vy2[2], spu_and(spu_slqwbytebc(spu_slqw(vy2[2], 15), 15), 0xefc60000U));
        vy2[3] = spu_xor(vy2[3], spu_and(spu_slqwbytebc(spu_slqw(vy2[3], 15), 15), 0xefc60000U));

        vy2[0] = spu_xor(vy2[0], spu_rlmask(vy2[0], -18));
        vy2[1] = spu_xor(vy2[1], spu_rlmask(vy2[1], -18));
        vy2[2] = spu_xor(vy2[2], spu_rlmask(vy2[2], -18));
        vy2[3] = spu_xor(vy2[3], spu_rlmask(vy2[3], -18));

        vsum = spu_add(vsum, vy2[0]);
        vsum = spu_add(vsum, vy2[1]);
        vsum = spu_add(vsum, vy2[2]);
        vsum = spu_add(vsum, vy2[3]);

        // SPE0rp򹹿
        rp++;
        if (rp == BUFFERS)
            rp = 0;
        spu_mfcdma64(&rp, rp_ea_high, rp_ea_low, sizeof(rp), TAGID,
                     MFC_PUTF_CMD);

    }
    return vsum;
}


static void
barrier_init(int speid)
{
    unsigned long long ls_base, barrier_buf_ea;
    if (speid == 0) {
        ls_base = ((unsigned long long)ls_high[1] << 32) | ls_low[1];
        barrier_buf_ea = ls_base + (unsigned)&barrier_buf[0][0];
        barrier_buf[0][0] = 1;
    } else {
        barrier_buf[1][0] = 1;
        ls_base = ((unsigned long long)ls_high[0] << 32) | ls_low[0];
        barrier_buf_ea = ls_base + (unsigned)&barrier_buf[1][0];
    }
    barrier_buf_eah = barrier_buf_ea >> 32;
    barrier_buf_eal = (unsigned)barrier_buf_ea;

}



/*
 * Two-party barrier between SPE 0 and the other SPE.
 *
 * The non-zero SPE "joins" by DMAing its row 1 flag to the peer and then
 * spins until row 0 becomes non-zero.  SPE 0 spins until row 1 becomes
 * non-zero and then "releases" by DMAing its row 0 flag to the peer.  Each
 * side clears the flag it waited on so the barrier can be used again.
 * Requires barrier_init() to have been called first.
 */
static void
barrier(int speid)
{
    if (speid != 0) {
        /* join */
        spu_mfcdma64(&barrier_buf[1][0], barrier_buf_eah, barrier_buf_eal,
                     16, TAGID, MFC_PUT_CMD);
        /* wait for SPE 0's release */
        while (barrier_buf[0][0] == 0) {
        }
        barrier_buf[0][0] = 0;
        return;
    }

    /* wait for the other SPE to join */
    while (barrier_buf[1][0] == 0) {
    }
    barrier_buf[1][0] = 0;
    /* release */
    spu_mfcdma64(&barrier_buf[0][0], barrier_buf_eah, barrier_buf_eal,
                 16, TAGID, MFC_PUT_CMD);
}


/*
 * Split the 64-bit address EA into its upper 32 bits (stored in HIGH) and
 * lower 32 bits (stored in LOW).
 *
 * Wrapped in do { } while (0) so that "SET_EA(h, l, ea);" is a single
 * statement and stays correct under an unbraced if/else.  EA is evaluated
 * exactly once.
 */
#define SET_EA(HIGH, LOW, EA)                               \
    do {                                                    \
        const unsigned long long set_ea_value_ = (EA);      \
        (HIGH) = (unsigned)(set_ea_value_ >> 32);           \
        (LOW) = (unsigned)set_ea_value_;                    \
    } while (0)

/*
 * Generate num_rand Mersenne Twister outputs cooperatively on two SPEs and
 * return their wrapping 32-bit sum.
 *
 * num_rand is split into loop1 = num_rand / 624 full blocks and a tail of
 * loop2 = num_rand % 624 values.  SPE 0 regenerates the state block by block
 * (genrand_mine_update) and DMAs each block to the other SPE, which tempers
 * and sums it (genrand_mine_tempering).  SPE 1 then produces the tail by
 * running one more, partial, state update on the most recent block.
 *
 * Only loop2 / 4 whole vectors of the tail are processed, so up to three
 * trailing values are not included when num_rand is not a multiple of 4.
 *
 * Returns the sum on SPE 1 and 0 on SPE 0 (which accumulates nothing).
 */
unsigned int
genrand_mine(int num_rand)
{
    int i;

    /* Constants for the tail update; same meaning as in
     * genrand_mine_update(): vshuf yields a one-word-shifted view of two
     * adjacent vectors, vsel selects the low 31 bits. */
    const vec_uchar16 vshuf = (vec_uchar16){4, 5, 6, 7, 8, 9, 10, 11,
                                            12, 13, 14, 15, 16, 17, 18, 19};
    const vec_uint4 vone = spu_splats(0x1U);
    const vec_uint4 vsel = spu_splats(0x7fffffffU);

    vec_uint4 *vmt = (vec_uint4 *)mt;

    vec_uint4 v1a, v1b, v2a, v2b, v1, v2;
    vec_uint4 vmag01;
    vec_uint4 vsum = spu_splats(0U);
    vec_uint4 vy, vy2;
    int k, k1, k2, k3;

    int loop1 = num_rand / 624;
    int loop2 = num_rand - loop1 * 624;

    /* Every DMA in this file uses TAGID, so tag-status waits only need to
     * watch that one tag group. */
    spu_writech(MFC_WrTagMask, 1 << TAGID);

    /* Set up the barrier flag and its target address in the peer. */
    barrier_init(speid);

    if (speid == 0) {
        unsigned long long ls_ea, ls_ea_mt;

        /* Base address of SPE 1. */
        ls_ea = ((unsigned long long)ls_high[1] << 32) | ls_low[1];

        /* Address of wp inside SPE 1. */
        SET_EA(wp_ea_high, wp_ea_low, ls_ea + (unsigned)&wp);

        /* Addresses of both halves of every block of mt[] inside SPE 1. */
        ls_ea_mt = ls_ea + (unsigned)mt;
        for (i = 0; i < BUFFERS; i++) {
            /* First half: 304 words (76 vectors). */
            SET_EA(mt_ea_high[i][0], mt_ea_low[i][0], ls_ea_mt);
            ls_ea_mt += sizeof(mt[0]) * 304;
            /* Second half: 320 words (80 vectors). */
            SET_EA(mt_ea_high[i][1], mt_ea_low[i][1], ls_ea_mt);
            ls_ea_mt += sizeof(mt[0]) * 320;
        }
    } else {
        unsigned long long ls_ea;

        /* Base address of SPE 0. */
        ls_ea = ((unsigned long long)ls_high[0] << 32) | ls_low[0];

        /* Address of rp inside SPE 0, used to report how far we have read. */
        SET_EA(rp_ea_high, rp_ea_low, ls_ea + (unsigned)&rp);
    }

    wp = 0;
    rp = 0;

    /* Both sides must have finished their setup before any data moves. */
    barrier(speid);

    /* Main loop: full 624-word blocks. */
    if (speid == 0)
        genrand_mine_update(loop1);
    else
        vsum = genrand_mine_tempering(loop1);

    /* Tail: the remaining loop2 values, handled on SPE 1 only. */
    if (speid == 1) {
        k1 = 1;     /* vector following vector k */
        k2 = 99;    /* vector holding word 396; shifted by one word = 397 */
        k3 = 100;   /* vector following k2 */

        /*
         * Continue from the most recently consumed block, i.e. the one just
         * before rp.  When no full block was consumed (loop1 <= 0) rp is
         * still 0 and there is no previous block: stepping back would select
         * block BUFFERS - 1, which nothing has written.  Stay on block 0 in
         * that case (presumably seeded by init_genrand_mine() on this SPE --
         * confirm against the caller).
         */
        if (loop1 > 0) {
            rp -= 1;
            if (rp < 0)
                rp += BUFFERS;
        }

        vmt = (vec_uint4 *)(mt + 624 * rp);

        for (k = 0; k < loop2 / 4; k++) {
            v1a = vmt[k];
            v1b = vmt[k1];
            v2a = vmt[k2];
            v2b = vmt[k3];

            /* One state-update step for four words (see
             * genrand_mine_update() for the recurrence). */
            v1 = spu_shuffle(v1a, v1b, vshuf);
            v2 = spu_shuffle(v2a, v2b, vshuf);
            vy = spu_sel(v1a, v1, vsel);
            vmag01 = spu_and(spu_cmpeq(spu_and(vy, vone), vone),
                             0x9908b0dfU);
            vy2 = spu_xor(spu_xor(v2, spu_rlmask(vy, -1)), vmag01);

            /* Store the new state in place, then temper and accumulate. */
            vmt[k] = vy2;
            vy2 = spu_xor(vy2, spu_rlmask(vy2, -11));
            vy2 = spu_xor(vy2, spu_and(spu_sl(vy2, 7), 0x9d2c5680U));
            vy2 = spu_xor(vy2, spu_and(spu_sl(vy2, 15), 0xefc60000U));
            vy2 = spu_xor(vy2, spu_rlmask(vy2, -18));
            vsum = spu_add(vsum, vy2);

            k1++;
            if (k1 == 156)
                k1 = 0;
            k2++;
            if (k2 == 156)
                k2 = 0;
            k3++;
            if (k3 == 156)
                k3 = 0;
        }

        /* Horizontal add: fold the four lane sums into lane 0. */
        vsum = spu_add(vsum, spu_rlqwbyte(vsum, 8));
        vsum = spu_add(vsum, spu_rlqwbyte(vsum, 4));
    }

    /* Synchronise once more before returning. */
    barrier(speid);

    return spu_extract(vsum, 0);
}
