/* -*- Mode: C++; c-basic-offset: 4; indent-tabs-mode: nil; tab-width: 4 -*- */
/* vi: set ts=4 sw=4 expandtab: (add to ~/.vimrc: set modeline modelines=5) */
/* ***** BEGIN LICENSE BLOCK *****
 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
 *
 * The contents of this file are subject to the Mozilla Public License Version
 * 1.1 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the
 * License.
 *
 * The Original Code is [Open Source Virtual Machine].
 *
 * The Initial Developer of the Original Code is
 * MIPS Technologies Inc
 * Portions created by the Initial Developer are Copyright (C) 2009
 * the Initial Developer. All Rights Reserved.
 *
 * Contributor(s):
 *   Chris Dearman <chris@mips.com>
 *
 * Alternatively, the contents of this file may be used under the terms of
 * either the GNU General Public License Version 2 or later (the "GPL"), or
 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
 * in which case the provisions of the GPL or the LGPL are applicable instead
 * of those above. If you wish to allow use of your version of this file only
 * under the terms of either the GPL or the LGPL, and not to allow others to
 * use your version of this file under the terms of the MPL, indicate your
 * decision by deleting the provisions above and replace them with the notice
 * and other provisions required by the GPL or the LGPL. If you do not delete
 * the provisions above, a recipient may use your version of this file under
 * the terms of any one of the MPL, the GPL or the LGPL.
 *
 * ***** END LICENSE BLOCK ***** */

#include "nanojit.h"

#if defined FEATURE_NANOJIT && defined NANOJIT_MIPS

namespace nanojit
{
#ifdef NJ_VERBOSE
    const char *regNames[] = {
        "$zr", "$at", "$v0", "$v1", "$a0", "$a1", "$a2", "$a3",
        "$t0", "$t1", "$t2", "$t3", "$t4", "$t5", "$t6", "$t7",
        "$s0", "$s1", "$s2", "$s3", "$s4", "$s5", "$s6", "$s7",
        "$t8", "$t9", "$k0", "$k1", "$gp", "$sp", "$fp", "$ra",

        "$f0",  "$f1",  "$f2",  "$f3",  "$f4",  "$f5",  "$f6",  "$f7",
        "$f8",  "$f9",  "$f10", "$f11", "$f12", "$f13", "$f14", "$f15",
        "$f16", "$f17", "$f18", "$f19", "$f20", "$f21", "$f22", "$f23",
        "$f24", "$f25", "$f26", "$f27", "$f28", "$f29", "$f30", "$f31"
    };

    const char *cname[16] = {
        "f",    "un",   "eq",   "ueq",
        "olt",  "ult",  "ole",  "ule",
        "sf",   "ngle", "seq",  "ngl",
        "lt",   "nge",  "le",   "ngt"
    };

    const char *fname[32] = {
        "resv", "resv", "resv", "resv",
        "resv", "resv", "resv", "resv",
        "resv", "resv", "resv", "resv",
        "resv", "resv", "resv", "resv",
        "s",    "d",    "resv", "resv",
        "w",    "l",    "ps",   "resv",
        "resv", "resv", "resv", "resv",
        "resv", "resv", "resv", "resv",
    };

    const char *oname[64] = {
        "special", "regimm", "j",    "jal",   "beq",      "bne",  "blez",  "bgtz",
        "addi",    "addiu",  "slti", "sltiu", "andi",     "ori",  "xori",  "lui",
        "cop0",    "cop1",   "cop2", "cop1x", "beql",     "bnel", "blezl", "bgtzl",
        "resv",    "resv",   "resv", "resv",  "special2", "jalx", "resv",  "special3",
        "lb",      "lh",     "lwl",  "lw",    "lbu",      "lhu",  "lwr",   "resv",
        "sb",      "sh",     "swl",  "sw",    "resv",     "resv", "swr",   "cache",
        "ll",      "lwc1",   "lwc2", "pref",  "resv",     "ldc1", "ldc2",  "resv",
        "sc",      "swc1",   "swc2", "resv",  "resv",     "sdc1", "sdc2",  "resv",
    };
#endif

    const Register Assembler::argRegs[] = { A0, A1, A2, A3 };
    const Register Assembler::retRegs[] = { V0, V1 };
    const Register Assembler::savedRegs[] = {
        S0, S1, S2, S3, S4, S5, S6, S7,
#ifdef FPCALLEESAVED
        FS0, FS1, FS2, FS3, FS4, FS5
#endif
    };

#define USE(x) (void)x
#define BADOPCODE(op) NanoAssertMsgf(false, "unexpected opcode %s", lirNames[op])

    // This function will get optimised by the compiler into a known value
    static inline bool isLittleEndian(void)
    {
        const union {
            uint32_t      ival;
            unsigned char cval[4];
        } u = { 1 };
        return u.cval[0] == 1;
    }

    // Offsets to the most/least significant words of 64-bit data in memory,
    // and the register of a pair (r, r+1) holding each word.
    // These functions will get optimised by the compiler into known values.
    static inline int mswoff(void) {
        return isLittleEndian() ? 4 : 0;
    }

    static inline int lswoff(void) {
        return isLittleEndian() ? 0 : 4;
    }

    static inline Register mswregpair(Register r) {
        return Register(r + (isLittleEndian() ? 1 : 0));
    }

    static inline Register lswregpair(Register r) {
        return Register(r + (isLittleEndian() ? 0 : 1));
    }

// These variables affect the code generator.
// They can be defined as constants, in which case the compiler will remove
// the unused paths through dead-code elimination.
// Alternatively they can be defined as variables, which allows the exact
// code generated to be determined at runtime.
//
//  cpu_has_fpu        CPU has fpu
//  cpu_has_movn       CPU has movn
//  cpu_has_cmov       CPU has movf/movn instructions
//  cpu_has_lsdc1      CPU has ldc1/sdc1 instructions
//  cpu_has_lsdxc1     CPU has ldxc1/sdxc1 instructions
//  cpu_has_fpuhazard  hazard between c.xx.xx & bc1[tf]
//
// Currently the values are initialised based on preprocessor definitions

#ifdef DEBUG
    // Don't allow the compiler to eliminate dead code for debug builds
    #define _CONST
#else
    #define _CONST const
#endif

#if NJ_SOFTFLOAT_SUPPORTED
    _CONST bool cpu_has_fpu = false;
#else
    _CONST bool cpu_has_fpu = true;
#endif

#if (__mips==4 || __mips==32 || __mips==64)
    _CONST bool cpu_has_cmov = true;
#else
    _CONST bool cpu_has_cmov = false;
#endif

#if __mips != 1
    _CONST bool cpu_has_lsdc1 = true;
#else
    _CONST bool cpu_has_lsdc1 = false;
#endif

#if (__mips==32 || __mips==64) && __mips_isa_rev>=2
    _CONST bool cpu_has_lsdxc1 = true;
#else
    _CONST bool cpu_has_lsdxc1 = false;
#endif

#if (__mips==1 || __mips==2 || __mips==3)
    _CONST bool cpu_has_fpuhazard = true;
#else
    _CONST bool cpu_has_fpuhazard = false;
#endif
#undef _CONST

    /* Support routines */
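    // NB: nanojit assembles code backwards, so within each routine below the
    // emission macros appear in reverse execution order: the last macro call in
    // source order produces the first instruction executed.  The pseudo-assembly
    // comments, however, are written in execution order.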

    debug_only (
                // break to debugger when generating code to this address
                static NIns *breakAddr;
                static void codegenBreak(NIns *genAddr)
                {
                    NanoAssert (breakAddr != genAddr);
                }
    )

    // Equivalent to assembler %hi(), %lo()
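    // hi() compensates for the sign-extension of the 16-bit low half, so that
    // (hi(v) << 16) + (int16_t)lo(v) == v (modulo 2^32); lui/addiu, or a signed
    // load/store offset, can then reconstruct the full 32-bit value.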
    uint16_t hi(uint32_t v)
    {
        uint16_t r = v >> 16;
        if ((int16_t)(v) < 0)
            r += 1;
        return r;
    }

    int16_t lo(uint32_t v)
    {
        int16_t r = v;
        return r;
    }

    void Assembler::asm_li32(Register r, int32_t imm)
    {
        // general case: generate a full 32-bit immediate load (lui + addiu)
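        // (emitted in reverse: executes as lui $r,hi(imm); addiu $r,$r,lo(imm))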
        ADDIU(r, r, lo(imm));
        LUI(r, hi(imm));
    }

    void Assembler::asm_li(Register r, int32_t imm)
    {
#if !PEDANTIC
        if (isU16(imm)) {
            ORI(r, ZERO, imm);
            return;
        }
        if (isS16(imm)) {
            ADDIU(r, ZERO, imm);
            return;
        }
        if ((imm & 0xffff) == 0) {
            LUI(r, uint32_t(imm) >> 16);
            return;
        }
#endif
        asm_li32(r, imm);
    }

    // 64 bit immediate load to a register pair
    void Assembler::asm_li_d(Register r, int32_t msw, int32_t lsw)
    {
        if (IsFpReg(r)) {
            NanoAssert(cpu_has_fpu);
            // li   $at,lsw         # iff lsw != 0
            // mtc1 $at,$r          # may use $0 instead of $at
            // li   $at,msw         # iff (msw != 0) && (msw != lsw)
            // mtc1 $at,$(r+1)      # may use $0 instead of $at
            if (msw == 0)
                MTC1(ZERO, r+1);
            else {
                MTC1(AT, r+1);
                // If the MSW & LSW values are different, reload AT
                if (msw != lsw)
                    asm_li(AT, msw);
            }
            if (lsw == 0)
                MTC1(ZERO, r);
            else {
                MTC1(AT, r);
                asm_li(AT, lsw);
            }
        }
        else {
            /*
             * li $r.lo, lsw
             * li $r.hi, msw   # will be converted to move $r.hi,$r.lo if (msw==lsw)
             */
            if (msw == lsw)
                MOVE(mswregpair(r), lswregpair(r));
            else
                asm_li(mswregpair(r), msw);
            asm_li(lswregpair(r), lsw);
        }
    }

    void Assembler::asm_move(Register d, Register s)
    {
        MOVE(d, s);
    }

    // General load/store operation
    void Assembler::asm_ldst(int op, Register rt, int dr, Register rbase)
    {
#if !PEDANTIC
        if (isS16(dr)) {
            LDST(op, rt, dr, rbase);
            return;
        }
#endif

        // lui AT,hi(d)
        // addu AT,rbase
        // ldst rt,lo(d)(AT)
        LDST(op, rt, lo(dr), AT);
        ADDU(AT, AT, rbase);
        LUI(AT, hi(dr));
    }

    void Assembler::asm_ldst64(bool store, Register r, int dr, Register rbase)
    {
#if !PEDANTIC
        if (isS16(dr) && isS16(dr+4)) {
            if (IsGpReg(r)) {
                LDST(store ? OP_SW : OP_LW, r+1, dr+4, rbase);
                LDST(store ? OP_SW : OP_LW, r,   dr, rbase);
            }
            else {
                NanoAssert(cpu_has_fpu);
                // NanoAssert((dr & 7) == 0);
                if (cpu_has_lsdc1 && ((dr & 7) == 0)) {
                    // lsdc1 $fr,dr($rbase)
                    LDST(store ? OP_SDC1 : OP_LDC1, r, dr, rbase);
                }
                else {
                    // lswc1 $fr,  dr+LSWOFF($rbase)
                    // lswc1 $fr+1,dr+MSWOFF($rbase)
                    LDST(store ? OP_SWC1 : OP_LWC1, r+1, dr+mswoff(), rbase);
                    LDST(store ? OP_SWC1 : OP_LWC1, r,   dr+lswoff(), rbase);
                }
                return;
            }
        }
#endif

        if (IsGpReg(r)) {
            // lui   $at,%hi(d)
            // addu  $at,$rbase
            // ldst  $r,  %lo(d)($at)
            // ldst  $r+1,%lo(d+4)($at)
            LDST(store ? OP_SW : OP_LW, r+1, lo(dr+4), AT);
            LDST(store ? OP_SW : OP_LW, r,   lo(dr), AT);
            ADDU(AT, AT, rbase);
            LUI(AT, hi(dr));
        }
        else {
            NanoAssert(cpu_has_fpu);
            if (cpu_has_lsdxc1) {
                // li     $at,dr
                // lsdxc1 $r,$at($rbase)
                if (store)
                    SDXC1(r, AT, rbase);
                else
                    LDXC1(r, AT, rbase);
                asm_li(AT, dr);
            }
            else if (cpu_has_lsdc1) {
                // lui    $at,%hi(dr)
                // addu   $at,$rbase
                // lsdc1  $r,%lo(dr)($at)
                LDST(store ? OP_SDC1 : OP_LDC1, r, lo(dr), AT);
                ADDU(AT, AT, rbase);
                LUI(AT, hi(dr));
            }
            else {
                // lui   $at,%hi(d)
                // addu  $at,$rbase
                // lswc1 $r,  %lo(d+LSWOFF)($at)
                // lswc1 $r+1,%lo(d+MSWOFF)($at)
                LDST(store ? OP_SWC1 : OP_LWC1, r+1, lo(dr+mswoff()), AT);
                LDST(store ? OP_SWC1 : OP_LWC1, r,   lo(dr+lswoff()), AT);
                ADDU(AT, AT, rbase);
                LUI(AT, hi(dr));
            }
        }
    }

    void Assembler::asm_store_imm64(LIns *value, int dr, Register rbase)
    {
        NanoAssert(value->isImmD());
        int32_t msw = value->immDhi();
        int32_t lsw = value->immDlo();

        // li $at,lsw                   # iff lsw != 0
        // sw $at,off+LSWOFF($rbase)    # may use $0 instead of $at
        // li $at,msw                   # iff (msw != 0) && (msw != lsw)
        // sw $at,off+MSWOFF($rbase)    # may use $0 instead of $at

        NanoAssert(isS16(dr) && isS16(dr+4));

        if (msw == 0)
            SW(ZERO, dr+mswoff(), rbase);
        else {
            SW(AT, dr+mswoff(), rbase);
            // If the MSW & LSW values are different, reload AT
            if (msw != lsw)
                asm_li(AT, msw);
        }
        if (lsw == 0)
            SW(ZERO, dr+lswoff(), rbase);
        else {
            SW(AT, dr+lswoff(), rbase);
            asm_li(AT, lsw);
        }
    }

    void Assembler::asm_regarg(ArgType ty, LIns* p, Register r)
    {
        NanoAssert(deprecated_isKnownReg(r));
        if (ty == ARGTYPE_I || ty == ARGTYPE_UI) {
            // arg goes in specific register
            if (p->isImmI())
                asm_li(r, p->immI());
            else {
                if (p->isExtant()) {
                    if (!p->deprecated_hasKnownReg()) {
                        // load it into the arg reg
                        int d = findMemFor(p);
                        if (p->isop(LIR_allocp))
                            ADDIU(r, FP, d);
                        else
                            asm_ldst(OP_LW, r, d, FP);
                    }
                    else
                        // it must be in a saved reg
                        MOVE(r, p->deprecated_getReg());
                }
                else {
                    // this is the last use, so fine to assign it
                    // to the scratch reg, it's dead after this point.
                    findSpecificRegFor(p, r);
                }
            }
        }
        else {
            // Other argument types unsupported
            NanoAssert(false);
        }
    }

    void Assembler::asm_stkarg(LIns* arg, int stkd)
    {
        bool isF64 = arg->isD();
        Register rr;
        if (arg->isExtant() && (rr = arg->deprecated_getReg(), deprecated_isKnownReg(rr))) {
            // The argument resides somewhere in registers, so we simply need to
            // push it onto the stack.
            if (!cpu_has_fpu || !isF64) {
                NanoAssert(IsGpReg(rr));
                SW(rr, stkd, SP);
            }
            else {
                NanoAssert(cpu_has_fpu);
                NanoAssert(IsFpReg(rr));
                NanoAssert((stkd & 7) == 0);
                asm_ldst64(true, rr, stkd, SP);
            }
        }
        else {
            // The argument does not reside in registers, so we need to get some
            // memory for it and then copy it onto the stack.
            int d = findMemFor(arg);
            if (!isF64) {
                SW(AT, stkd, SP);
                if (arg->isop(LIR_allocp))
                    ADDIU(AT, FP, d);
                else
                    LW(AT, d, FP);
            }
            else {
                NanoAssert((stkd & 7) == 0);
                SW(AT, stkd+4, SP);
                LW(AT, d+4,    FP);
                SW(AT, stkd,   SP);
                LW(AT, d,      FP);
            }
        }
    }

    // Encode a 64-bit floating-point argument using the appropriate ABI.
    // This function operates in the same way as asm_arg, except that it will only
    // handle arguments where (ArgType)ty == ARGTYPE_D.
    void
    Assembler::asm_arg_64(LIns* arg, Register& r, Register& fr, int& stkd)
    {
        // The stack offset must always be at least aligned to 4 bytes.
        NanoAssert((stkd & 3) == 0);
#if NJ_SOFTFLOAT_SUPPORTED
        NanoAssert(arg->isop(LIR_ii2d));
#else
        NanoAssert(cpu_has_fpu);
#endif

        // The O32 ABI requires that 64-bit arguments are aligned on even-numbered
        // registers, as A0:A1/FA0 or A2:A3/FA1. Use the stack offset to keep track
        // of where we are.
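        // e.g. for (int, double) the int is passed in A0 and the double in A2:A3;
        // for (double, ...) the first double is passed in FA0.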
        if (stkd & 4) {
            if (stkd < 16) {
                r = Register(r + 1);
                fr = Register(fr + 1);
            }
            stkd += 4;
        }

        if (stkd < 16) {
            NanoAssert(fr == FA0 || fr == FA1 || fr == A2);
            if (fr == FA0 || fr == FA1)
                findSpecificRegFor(arg, fr);
            else {
                findSpecificRegFor(arg, FA1);
                // Move it to the integer pair
                Register fpupair = arg->getReg();
                Register intpair = fr;
                MFC1(mswregpair(intpair), Register(fpupair + 1));  // odd fpu register holds sign, exponent, mantissa-high
                MFC1(lswregpair(intpair), fpupair);                // even fpu register holds mantissa-low
            }
            r = Register(r + 2);
            fr = Register(fr + 2);
        }
        else
            asm_stkarg(arg, stkd);

        stkd += 8;
    }

    /* Required functions */

#define FRAMESIZE        8
#define RA_OFFSET        4
#define FP_OFFSET        0
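// Frame head layout implied by these offsets: $ra is saved at FP+RA_OFFSET and
// the caller's $fp at FP+FP_OFFSET, so incoming stack arguments start at
// FP+FRAMESIZE (see asm_param).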

    void Assembler::asm_store32(LOpcode op, LIns *value, int dr, LIns *base)
    {
        Register rt, rbase;
        getBaseReg2(GpRegs, value, rt, GpRegs, base, rbase, dr);

        switch (op) {
        case LIR_sti:
            asm_ldst(OP_SW, rt, dr, rbase);
            break;
        case LIR_sti2s:
            asm_ldst(OP_SH, rt, dr, rbase);
            break;
        case LIR_sti2c:
            asm_ldst(OP_SB, rt, dr, rbase);
            break;
        default:
            BADOPCODE(op);
        }

        TAG("asm_store32(value=%p{%s}, dr=%d, base=%p{%s})",
            value, lirNames[value->opcode()], dr, base, lirNames[base->opcode()]);
    }

    void Assembler::asm_ui2d(LIns *ins)
    {
        Register fr = deprecated_prepResultReg(ins, FpRegs);
        Register v = findRegFor(ins->oprnd1(), GpRegs);
        Register ft = registerAllocTmp(FpRegs & ~(rmask(fr)));    // allocate temporary register for constant

        // todo: support int value in memory, as per x86
        NanoAssert(deprecated_isKnownReg(v));

        // mtc1    $v,$ft
        // bgez    $v,1f
        //  cvt.d.w $fr,$ft          # branch delay slot
        // lui     $at,0x41f0        # (double)0x100000000LL = 0x41f0000000000000
        // mtc1    $0,$ft
        // mtc1    $at,$ft+1
        // add.d   $fr,$fr,$ft
        // 1:
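        // i.e. convert as a signed 32-bit value, then add 2^32 (0x41f0000000000000
        // as a double) when the source had its sign bit set, giving the unsigned value.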

        underrunProtect(6*4);   // keep branch and destination together
        NIns *here = _nIns;
        ADD_D(fr,fr,ft);
        MTC1(AT,ft+1);
        MTC1(ZERO,ft);
        LUI(AT,0x41f0);
        CVT_D_W(fr,ft);            // branch delay slot
        BGEZ(v,here);
        MTC1(v,ft);

        TAG("asm_ui2d(ins=%p{%s})", ins, lirNames[ins->opcode()]);
    }

    void Assembler::asm_d2i(LIns* ins)
    {
        NanoAssert(cpu_has_fpu);

        Register rr = deprecated_prepResultReg(ins, GpRegs);
        Register sr = findRegFor(ins->oprnd1(), FpRegs);
        // trunc.w.d $sr,$sr
        // mfc1 $rr,$sr
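        // (trunc.w.d rounds toward zero, matching C's double-to-integer conversion)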
        MFC1(rr,sr);
        TRUNC_W_D(sr,sr);
        TAG("asm_d2i(ins=%p{%s})", ins, lirNames[ins->opcode()]);
    }

    void Assembler::asm_fop(LIns *ins)
    {
        NanoAssert(cpu_has_fpu);
        if (cpu_has_fpu) {
            LIns* lhs = ins->oprnd1();
            LIns* rhs = ins->oprnd2();
            LOpcode op = ins->opcode();

            // rr = ra OP rb

            Register rr = deprecated_prepResultReg(ins, FpRegs);
            Register ra = findRegFor(lhs, FpRegs);
            Register rb = (rhs == lhs) ? ra : findRegFor(rhs, FpRegs & ~rmask(ra));

            switch (op) {
            case LIR_addd: ADD_D(rr, ra, rb); break;
            case LIR_subd: SUB_D(rr, ra, rb); break;
            case LIR_muld: MUL_D(rr, ra, rb); break;
            case LIR_divd: DIV_D(rr, ra, rb); break;
            default:
                BADOPCODE(op);
            }
        }
        TAG("asm_fop(ins=%p{%s})", ins, lirNames[ins->opcode()]);
    }

    void Assembler::asm_fneg(LIns *ins)
    {
        NanoAssert(cpu_has_fpu);
        if (cpu_has_fpu) {
            LIns* lhs = ins->oprnd1();
            Register rr = deprecated_prepResultReg(ins, FpRegs);
            Register sr = ( !lhs->isInReg()
                            ? findRegFor(lhs, FpRegs)
                            : lhs->deprecated_getReg() );
            NEG_D(rr, sr);
        }
        TAG("asm_fneg(ins=%p{%s})", ins, lirNames[ins->opcode()]);
    }

    void Assembler::asm_immd(LIns *ins)
    {
        int d = deprecated_disp(ins);
        Register rr = ins->deprecated_getReg();

        deprecated_freeRsrcOf(ins);

        if (cpu_has_fpu && deprecated_isKnownReg(rr)) {
            if (d)
                asm_spill(rr, d, true);
            asm_li_d(rr, ins->immDhi(), ins->immDlo());
        }
        else {
            NanoAssert(d);
            asm_store_imm64(ins, d, FP);
        }
        TAG("asm_immd(ins=%p{%s})", ins, lirNames[ins->opcode()]);
    }

#ifdef NANOJIT_64BIT
    void
    Assembler::asm_q2i(LIns *)
    {
        NanoAssert(0);  // q2i shouldn't occur on 32-bit platforms
    }

    void Assembler::asm_ui2uq(LIns *ins)
    {
        USE(ins);
        TODO(asm_ui2uq);
        TAG("asm_ui2uq(ins=%p{%s})", ins, lirNames[ins->opcode()]);
    }
#endif

    void Assembler::asm_load64(LIns *ins)
    {
        NanoAssert(ins->isD());

        LIns* base = ins->oprnd1();
        int dr = ins->disp();

        Register rd = ins->deprecated_getReg();
        int ds = deprecated_disp(ins);

        Register rbase = findRegFor(base, GpRegs);
        NanoAssert(IsGpReg(rbase));
        deprecated_freeRsrcOf(ins);

        if (cpu_has_fpu && deprecated_isKnownReg(rd)) {
            NanoAssert(IsFpReg(rd));
            asm_ldst64 (false, rd, dr, rbase);
        }
        else {
            // Either FPU is not available or the result needs to go into memory;
            // in either case, FPU instructions are not required. Note that the
            // result will never be loaded into registers if FPU is not available.
            NanoAssert(!deprecated_isKnownReg(rd));
            NanoAssert(ds != 0);

            NanoAssert(isS16(dr) && isS16(dr+4));
            NanoAssert(isS16(ds) && isS16(ds+4));

            // Check that the offset is 8-byte (64-bit) aligned.
            NanoAssert((ds & 0x7) == 0);

            // FIXME: allocate a temporary to use for the copy
            // to avoid load to use delay
            // lw $at,dr($rbase)
            // sw $at,ds($fp)
            // lw $at,dr+4($rbase)
            // sw $at,ds+4($fp)

            SW(AT, ds+4, FP);
            LW(AT, dr+4, rbase);
            SW(AT, ds,   FP);
            LW(AT, dr,   rbase);
        }

        TAG("asm_load64(ins=%p{%s})", ins, lirNames[ins->opcode()]);
    }

    void Assembler::asm_cond(LIns *ins)
    {
        Register r = deprecated_prepResultReg(ins, GpRegs);
        LOpcode op = ins->opcode();
        LIns *a = ins->oprnd1();
        LIns *b = ins->oprnd2();

        asm_cmp(op, a, b, r);

        TAG("asm_cond(ins=%p{%s})", ins, lirNames[ins->opcode()]);
    }

#if NJ_SOFTFLOAT_SUPPORTED
    void Assembler::asm_qhi(LIns *ins)
    {
        Register rr = deprecated_prepResultReg(ins, GpRegs);
        LIns *q = ins->oprnd1();
        int d = findMemFor(q);
        LW(rr, d+mswoff(), FP);
        TAG("asm_qhi(ins=%p{%s})", ins, lirNames[ins->opcode()]);
    }

    void Assembler::asm_qlo(LIns *ins)
    {
        Register rr = deprecated_prepResultReg(ins, GpRegs);
        LIns *q = ins->oprnd1();
        int d = findMemFor(q);
        LW(rr, d+lswoff(), FP);
        TAG("asm_qlo(ins=%p{%s})", ins, lirNames[ins->opcode()]);
    }

    void Assembler::asm_qjoin(LIns *ins)
    {
        int d = findMemFor(ins);
        NanoAssert(d && isS16(d));
        LIns* lo = ins->oprnd1();
        LIns* hi = ins->oprnd2();

        Register r = findRegFor(hi, GpRegs);
        SW(r, d+mswoff(), FP);
        r = findRegFor(lo, GpRegs);             // okay if r gets recycled.
        SW(r, d+lswoff(), FP);
        deprecated_freeRsrcOf(ins);             // if we had a reg in use, flush it to mem

        TAG("asm_qjoin(ins=%p{%s})", ins, lirNames[ins->opcode()]);
    }

#endif

    void Assembler::asm_neg_not(LIns *ins)
    {
        LOpcode op = ins->opcode();
        Register rr = deprecated_prepResultReg(ins, GpRegs);

        LIns* lhs = ins->oprnd1();
        // If this is the last use of lhs in reg, we can re-use result reg.
        // Else, lhs already has a register assigned.
        Register ra = !lhs->isInReg() ? findSpecificRegFor(lhs, rr) : lhs->deprecated_getReg();
        if (op == LIR_noti)
            NOT(rr, ra);
        else
            NEGU(rr, ra);
        TAG("asm_neg_not(ins=%p{%s})", ins, lirNames[ins->opcode()]);
    }

    void Assembler::asm_immi(LIns *ins)
    {
        Register rr = deprecated_prepResultReg(ins, GpRegs);
        asm_li(rr, ins->immI());
        TAG("asm_immi(ins=%p{%s})", ins, lirNames[ins->opcode()]);
    }

    void Assembler::asm_cmov(LIns *ins)
    {
        LIns* condval = ins->oprnd1();
        LIns* iftrue  = ins->oprnd2();
        LIns* iffalse = ins->oprnd3();

        NanoAssert(condval->isCmp());
        NanoAssert(ins->opcode() == LIR_cmovi && iftrue->isI() && iffalse->isI());

        const Register rr = deprecated_prepResultReg(ins, GpRegs);
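        // Emitted in reverse, the code below executes as:
        //   <asm_cmp sets $at>; move $rr,<iffalse>; movn $rr,$iftruereg,$at
        // leaving iftrue in $rr when the condition is true.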

        const Register iftruereg = findRegFor(iftrue, GpRegs & ~rmask(rr));
        MOVN(rr, iftruereg, AT);
        /*const Register iffalsereg =*/ findSpecificRegFor(iffalse, rr);
        asm_cmp(condval->opcode(), condval->oprnd1(), condval->oprnd2(), AT);
        TAG("asm_cmov(ins=%p{%s})", ins, lirNames[ins->opcode()]);
    }

    void Assembler::asm_condd(LIns *ins)
    {
        NanoAssert(cpu_has_fpu);
        if (cpu_has_fpu) {
            Register r = deprecated_prepResultReg(ins, GpRegs);
            LOpcode op = ins->opcode();
            LIns *a = ins->oprnd1();
            LIns *b = ins->oprnd2();

            if (cpu_has_cmov) {
                // c.xx.d  $a,$b
                // li      $r,1
                // movf    $r,$0,$fcc0
                MOVF(r, ZERO, 0);
                ORI(r, ZERO, 1);
            }
            else {
                // c.xx.d  $a,$b
                // [nop]
                // bc1t    1f
                //  li      $r,1
                // move    $r,$0
                // 1:
                NIns *here = _nIns;
                verbose_only(verbose_outputf("%p:", here);)
                underrunProtect(3*4);
                MOVE(r, ZERO);
                ORI(r, ZERO, 1);        // branch delay slot
                BC1T(here);
                if (cpu_has_fpuhazard)
                    NOP();
            }
            asm_cmp(op, a, b, r);
        }
        TAG("asm_condd(ins=%p{%s})", ins, lirNames[ins->opcode()]);
    }

    void Assembler::asm_i2d(LIns *ins)
    {
        NanoAssert(cpu_has_fpu);
        if (cpu_has_fpu) {
            Register fr = deprecated_prepResultReg(ins, FpRegs);
            Register v = findRegFor(ins->oprnd1(), GpRegs);

            // mtc1    $v,$fr
            // cvt.d.w $fr,$fr
            CVT_D_W(fr,fr);
            MTC1(v,fr);
        }
        TAG("asm_i2d(ins=%p{%s})", ins, lirNames[ins->opcode()]);
    }

    void Assembler::asm_ret(LIns *ins)
    {
        genEpilogue();

        releaseRegisters();
        assignSavedRegs();

        LIns *value = ins->oprnd1();
        if (ins->isop(LIR_reti)) {
            findSpecificRegFor(value, V0);
        }
        else {
            NanoAssert(ins->isop(LIR_retd));
#if NJ_SOFTFLOAT_SUPPORTED
            NanoAssert(value->isop(LIR_ii2d));
            findSpecificRegFor(value->oprnd1(), V0); // lo
            findSpecificRegFor(value->oprnd2(), V1); // hi
#else
            findSpecificRegFor(value, FV0);
#endif
        }
        TAG("asm_ret(ins=%p{%s})", ins, lirNames[ins->opcode()]);
    }

    void Assembler::asm_load32(LIns *ins)
    {
        LOpcode op = ins->opcode();
        LIns* base = ins->oprnd1();
        int d = ins->disp();

        Register rres = deprecated_prepResultReg(ins, GpRegs);
        Register rbase = getBaseReg(base, d, GpRegs);

        switch (op) {
        case LIR_lduc2ui:          // 8-bit integer load, zero-extend to 32-bit
            asm_ldst(OP_LBU, rres, d, rbase);
            break;
        case LIR_ldus2ui:          // 16-bit integer load, zero-extend to 32-bit
            asm_ldst(OP_LHU, rres, d, rbase);
            break;
        case LIR_ldc2i:          // 8-bit integer load, sign-extend to 32-bit
            asm_ldst(OP_LB, rres, d, rbase);
            break;
        case LIR_lds2i:          // 16-bit integer load, sign-extend to 32-bit
            asm_ldst(OP_LH, rres, d, rbase);
            break;
        case LIR_ldi:            // 32-bit integer load
            asm_ldst(OP_LW, rres, d, rbase);
            break;
        default:
            BADOPCODE(op);
        }

        TAG("asm_load32(ins=%p{%s})", ins, lirNames[ins->opcode()]);
    }

    void Assembler::asm_param(LIns *ins)
    {
        uint32_t a = ins->paramArg();
        uint32_t kind = ins->paramKind();

        if (kind == 0) {
            // ordinary param
            // first 4 args A0..A3
            if (a < 4) {
                // incoming arg in register
                deprecated_prepResultReg(ins, rmask(argRegs[a]));
            } else {
                // incoming arg is on stack
                Register r = deprecated_prepResultReg(ins, GpRegs);
                TODO(Check stack offset);
                int d = FRAMESIZE + a * sizeof(intptr_t);
                LW(r, d, FP);
            }
        }
        else {
            // saved param
            deprecated_prepResultReg(ins, rmask(savedRegs[a]));
        }
        TAG("asm_param(ins=%p{%s})", ins, lirNames[ins->opcode()]);
    }

    void Assembler::asm_arith(LIns *ins)
    {
        LOpcode op = ins->opcode();
        LIns* lhs = ins->oprnd1();
        LIns* rhs = ins->oprnd2();

        RegisterMask allow = GpRegs;

        // We always need the result register and the first operand register.
        Register rr = deprecated_prepResultReg(ins, allow);

        // If this is the last use of lhs in reg, we can re-use the result reg.
        // Else, lhs already has a register assigned.
        Register ra = !lhs->isInReg() ? findSpecificRegFor(lhs, rr) : lhs->deprecated_getReg();
        Register rb, t;

        // Don't re-use the registers we've already allocated.
        NanoAssert(deprecated_isKnownReg(rr));
        NanoAssert(deprecated_isKnownReg(ra));
        allow &= ~rmask(rr);
        allow &= ~rmask(ra);

        if (rhs->isImmI()) {
            int32_t rhsc = rhs->immI();
            if (isS16(rhsc)) {
                // MIPS arith immediate ops sign-extend the imm16 value
                switch (op) {
                case LIR_addxovi:
                case LIR_addjovi:
                    // add with overflow result into $at
                    // overflow is indicated by (sign(rr)^sign(ra)) & (sign(rr)^sign(rhsc))

                    // [move $t,$ra]            if (rr==ra)
                    // addiu $rr,$ra,rhsc
                    // [xor  $at,$rr,$ra]       if (rr!=ra)
                    // [xor  $at,$rr,$t]        if (rr==ra)
                    // [not  $t,$rr]            if (rhsc < 0)
                    // [and  $at,$at,$t]        if (rhsc < 0)
                    // [and  $at,$at,$rr]       if (rhsc >= 0)
                    // srl   $at,$at,31

                    t = registerAllocTmp(allow);
                    SRL(AT, AT, 31);
                    if (rhsc < 0) {
                        AND(AT, AT, t);
                        NOT(t, rr);
                    }
                    else
                        AND(AT, AT, rr);
                    if (rr == ra)
                        XOR(AT, rr, t);
                    else
                        XOR(AT, rr, ra);
                    ADDIU(rr, ra, rhsc);
                    if (rr == ra)
                        MOVE(t, ra);
                    goto done;
                case LIR_addi:
                    ADDIU(rr, ra, rhsc);
                    goto done;
                case LIR_subxovi:
                case LIR_subjovi:
                    // subtract with overflow result into $at
                    // overflow is indicated by (sign(ra)^sign(rhsc)) & (sign(rr)^sign(ra))

                    // [move $t,$ra]            if (rr==ra)
                    // addiu $rr,$ra,-rhsc
                    // [xor  $at,$rr,$ra]       if (rr!=ra)
                    // [xor  $at,$rr,$t]        if (rr==ra)
                    // [and  $at,$at,$ra]       if (rhsc >= 0 && rr!=ra)
                    // [and  $at,$at,$t]        if (rhsc >= 0 && rr==ra)
                    // [not  $t,$ra]            if (rhsc < 0 && rr!=ra)
                    // [not  $t,$t]             if (rhsc < 0 && rr==ra)
                    // [and  $at,$at,$t]        if (rhsc < 0)
                    // srl   $at,$at,31
                    if (isS16(-rhsc)) {
                        t = registerAllocTmp(allow);
                        SRL(AT,AT,31);
                        if (rhsc < 0) {
                            AND(AT, AT, t);
                            if (rr == ra)
                                NOT(t, t);
                            else
                                NOT(t, ra);
                        }
                        else {
                            if (rr == ra)
                                AND(AT, AT, t);
                            else
                                AND(AT, AT, ra);
                        }
                        if (rr == ra)
                            XOR(AT, rr, t);
                        else
                            XOR(AT, rr, ra);
                        ADDIU(rr, ra, -rhsc);
                        if (rr == ra)
                            MOVE(t, ra);
                        goto done;
                    }
                    break;
                case LIR_subi:
                    if (isS16(-rhsc)) {
                        ADDIU(rr, ra, -rhsc);
                        goto done;
                    }
                    break;
                case LIR_mulxovi:
                case LIR_muljovi:
                case LIR_muli:
                    // FIXME: optimise constant multiply by 2^n
                    // if ((rhsc & (rhsc-1)) == 0)
                    //    SLL(rr, ra, ffs(rhsc)-1);
                    //goto done;
                    break;
                default:
                    break;
                }
            }
            if (isU16(rhsc)) {
                // MIPS logical immediate ops zero-extend the imm16 value
                switch (op) {
                case LIR_ori:
                    ORI(rr, ra, rhsc);
                    goto done;
                case LIR_andi:
                    ANDI(rr, ra, rhsc);
                    goto done;
                case LIR_xori:
                    XORI(rr, ra, rhsc);
                    goto done;
                default:
                    break;
                }
            }

            // LIR shift ops only use the last 5 bits of the shift constant
            switch (op) {
            case LIR_lshi:
                SLL(rr, ra, rhsc&31);
                goto done;
            case LIR_rshui:
                SRL(rr, ra, rhsc&31);
                goto done;
            case LIR_rshi:
                SRA(rr, ra, rhsc&31);
                goto done;
            default:
                break;
            }
        }

        // general case, put rhs in register
        rb = (rhs == lhs) ? ra : findRegFor(rhs, allow);
        NanoAssert(deprecated_isKnownReg(rb));
        allow &= ~rmask(rb);

        // The register allocator will have set up one of these 4 cases
        // rr==ra && ra==rb              r0 = r0 op r0
        // rr==ra && ra!=rb              r0 = r0 op r1
        // rr!=ra && ra==rb              r0 = r1 op r1
        // rr!=ra && ra!=rb && rr!=rb    r0 = r1 op r2
        NanoAssert(ra == rb || rr != rb);

        switch (op) {
            case LIR_addxovi:
            case LIR_addjovi:
                // add with overflow result into $at
                // overflow is indicated by (sign(rr)^sign(ra)) & (sign(rr)^sign(rb))

                // [move $t,$ra]        if (rr==ra)
                // addu  $rr,$ra,$rb
                // ; Generate sign($rr)^sign($ra)
                // [xor  $at,$rr,$t]    sign($at)=sign($rr)^sign($t) if (rr==ra)
                // [xor  $at,$rr,$ra]   sign($at)=sign($rr)^sign($ra) if (rr!=ra)
                // ; Generate sign($rr)^sign($rb) if $ra!=$rb
                // [xor  $t,$rr,$rb]    if (ra!=rb)
                // [and  $at,$at,$t]    if (ra!=rb)
                // srl   $at,$at,31

                t = ZERO;
                if (rr == ra || ra != rb)
                    t = registerAllocTmp(allow);
                SRL(AT, AT, 31);
                if (ra != rb) {
                    AND(AT, AT, t);
                    XOR(t, rr, rb);
                }
                if (rr == ra)
                    XOR(AT, rr, t);
                else
                    XOR(AT, rr, ra);
                ADDU(rr, ra, rb);
                if (rr == ra)
                    MOVE(t, ra);
                break;
            case LIR_addi:
                ADDU(rr, ra, rb);
                break;
            case LIR_andi:
                AND(rr, ra, rb);
                break;
            case LIR_ori:
                OR(rr, ra, rb);
                break;
            case LIR_xori:
                XOR(rr, ra, rb);
                break;
            case LIR_subxovi:
            case LIR_subjovi:
                // subtract with overflow result into $at
                // overflow is indicated by (sign(ra)^sign(rb)) & (sign(rr)^sign(ra))

                // [move $t,$ra]        if (rr==ra)
                // ; Generate sign($at)=sign($ra)^sign($rb)
                // xor   $at,$ra,$rb
                // subu  $rr,$ra,$rb
                // ; Generate sign($t)=sign($rr)^sign($ra)
                // [xor  $t,$rr,$ra]    if (rr!=ra)
                // [xor  $t,$rr,$t]     if (rr==ra)
                // and   $at,$at,$t
                // srl   $at,$at,31

                if (ra == rb) {
                    // special case for (ra == rb) which can't overflow
                    MOVE(AT, ZERO);
                    SUBU(rr, ra, rb);
                }
                else {
                    t = registerAllocTmp(allow);
                    SRL(AT, AT, 31);
                    AND(AT, AT, t);
                    if (rr == ra)
                        XOR(t, rr, t);
                    else
                        XOR(t, rr, ra);
                    SUBU(rr, ra, rb);
                    XOR(AT, ra, rb);
                    if (rr == ra)
                        MOVE(t, ra);
                }
                break;
            case LIR_subi:
                SUBU(rr, ra, rb);
                break;
            case LIR_lshi:
                // SLLV uses the low-order 5 bits of rb for the shift amount so no masking required
                SLLV(rr, ra, rb);
                break;
            case LIR_rshi:
                // SRAV uses the low-order 5 bits of rb for the shift amount so no masking required
                SRAV(rr, ra, rb);
                break;
            case LIR_rshui:
                // SRLV uses the low-order 5 bits of rb for the shift amount so no masking required
                SRLV(rr, ra, rb);
                break;
            case LIR_mulxovi:
            case LIR_muljovi:
                t = registerAllocTmp(allow);
                // Overflow indication required
                // Do a 32x32 signed multiply generating a 64 bit result
                // Compare bit31 of the result with the high order bits
                // mult $ra,$rb
                // mflo $rr             # result to $rr
                // sra  $t,$rr,31       # $t = 0x00000000 or 0xffffffff
                // mfhi $at
                // xor  $at,$at,$t      # sets $at to nonzero if overflow
                XOR(AT, AT, t);
                MFHI(AT);
                SRA(t, rr, 31);
                MFLO(rr);
                MULT(ra, rb);
                break;
            case LIR_muli:
                MUL(rr, ra, rb);
                break;
            default:
                BADOPCODE(op);
        }
    done:
        TAG("asm_arith(ins=%p{%s})", ins, lirNames[ins->opcode()]);
    }

    void Assembler::asm_store64(LOpcode op, LIns *value, int dr, LIns *base)
    {
        // NanoAssert((dr & 7) == 0);
#if NANOJIT_64BIT
        NanoAssert (op == LIR_stq || op == LIR_std2f || op == LIR_std);
#else
        NanoAssert (op == LIR_std2f || op == LIR_std);
#endif

        switch (op) {
            case LIR_std:
                if (cpu_has_fpu) {
                    Register rbase = findRegFor(base, GpRegs);

                    if (value->isImmD())
                        asm_store_imm64(value, dr, rbase);
                    else {
                        Register fr = findRegFor(value, FpRegs);
                        asm_ldst64(true, fr, dr, rbase);
                    }
                }
                else {
                    Register rbase = findRegFor(base, GpRegs);
                    // *(uint64_t*)(rbase+dr) = *(uint64_t*)(FP+ds)

                    int ds = findMemFor(value);

                    // lw $at,ds(FP)
                    // sw $at,dr($rbase)
                    // lw $at,ds+4(FP)
                    // sw $at,dr+4($rbase)
                    SW(AT, dr+4, rbase);
                    LW(AT, ds+4, FP);
                    SW(AT, dr,   rbase);
                    LW(AT, ds,   FP);
                }

                break;
            case LIR_std2f:
                NanoAssertMsg(0, "NJ_EXPANDED_LOADSTORE_SUPPORTED not yet supported for this architecture");
                return;
            default:
                BADOPCODE(op);
                return;
        }

        TAG("asm_store64(value=%p{%s}, dr=%d, base=%p{%s})",
            value, lirNames[value->opcode()], dr, base, lirNames[base->opcode()]);
    }

    bool Assembler::canRemat(LIns* ins)
    {
        return ins->isImmI() || ins->isop(LIR_allocp);
    }

    void Assembler::asm_restore(LIns *i, Register r)
    {
        int d;
        if (i->isop(LIR_allocp)) {
            d = deprecated_disp(i);
            if (isS16(d))
                ADDIU(r, FP, d);
            else {
                ADDU(r, FP, AT);
                asm_li(AT, d);
            }
        }
        else if (i->isImmI()) {
            asm_li(r, i->immI());
        }
        else {
            d = findMemFor(i);
            if (IsFpReg(r)) {
                asm_ldst64(false, r, d, FP);
            }
            else {
                asm_ldst(OP_LW, r, d, FP);
            }
        }
        TAG("asm_restore(i=%p{%s}, r=%d)", i, lirNames[i->opcode()], r);
    }

    void Assembler::asm_cmp(LOpcode condop, LIns *a, LIns *b, Register cr)
    {
        RegisterMask allow = isCmpDOpcode(condop) ? FpRegs : GpRegs;
        Register ra = findRegFor(a, allow);
        Register rb = (b==a) ? ra : findRegFor(b, allow & ~rmask(ra));

        // FIXME: Use slti if b is small constant

        /* Generate the condition code */
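        // NB instructions are emitted in reverse, so each case below reads
        // bottom-up in execution order, e.g. LIR_eqi executes as
        //   xor $cr,$ra,$rb; sltiu $cr,$cr,1
        // and the le/ge cases compute the opposite slt/sltu and invert it with xori.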
        switch (condop) {
        case LIR_eqi:
            SLTIU(cr,cr,1);
            XOR(cr,ra,rb);
            break;
        case LIR_lti:
            SLT(cr,ra,rb);
            break;
        case LIR_gti:
            SLT(cr,rb,ra);
            break;
        case LIR_lei:
            XORI(cr,cr,1);
            SLT(cr,rb,ra);
            break;
        case LIR_gei:
            XORI(cr,cr,1);
            SLT(cr,ra,rb);
            break;
        case LIR_ltui:
            SLTU(cr,ra,rb);
            break;
        case LIR_gtui:
            SLTU(cr,rb,ra);
            break;
        case LIR_leui:
            XORI(cr,cr,1);
            SLTU(cr,rb,ra);
            break;
        case LIR_geui:
            XORI(cr,cr,1);
            SLTU(cr,ra,rb);
            break;
        case LIR_eqd:
            C_EQ_D(ra,rb);
            break;
        case LIR_ltd:
            C_LT_D(ra,rb);
            break;
        case LIR_gtd:
            C_LT_D(rb,ra);
            break;
        case LIR_led:
            C_LE_D(ra,rb);
            break;
        case LIR_ged:
            C_LE_D(rb,ra);
            break;
        default:
            debug_only(outputf("%s",lirNames[condop]);)
            TODO(asm_cond);
        }
    }

#define SEG(addr) (uint32_t(addr) & 0xf0000000)
#define SEGOFFS(addr) (uint32_t(addr) & 0x0fffffff)
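// A j/jal instruction encodes only a 26-bit instruction index (28 bits of address);
// the remaining upper 4 bits come from the address of the delay slot, so a jump can
// only reach targets within the same 256MB segment.  SEG() is used to test for that.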


    // Check that the branch target is in range
    // Generate a trampoline if it isn't
    // Emits the branch delay slot instruction
    NIns* Assembler::asm_branchtarget(NIns * const targ)
    {
        bool inrange;
        NIns *btarg = targ;

        // do the initial underrun check here to ensure that the inrange test below
        // is correct (underrunProtect may move _nIns to a new chunk, which would
        // change the branch offset)
        if (targ)
            underrunProtect(2 * 4);    // branch + delay slot

        // MIPS offsets are based on the address of the branch delay slot
        // which is the next instruction that will be generated
        ptrdiff_t bd = BOFFSET(targ-1);

#if PEDANTIC
        inrange = false;
#else
        inrange = (targ && isS16(bd));
#endif

        // If the branch target is known and in range we can just generate a branch
        // Otherwise generate a branch to a trampoline that will be stored in the
        // literal area
        if (inrange)
            NOP();
        else {
            NIns *tramp = _nSlot;
            if (targ) {
                // Can the target be reached by a jump instruction?
                if (SEG(targ) == SEG(tramp)) {
                    //  [linkedinstructions]
                    //  bxxx trampoline
                    //   nop
                    //  ...
                    // trampoline:
                    //  j targ
                    //   nop

                    underrunProtect(4 * 4);             // keep bxx and trampoline together

                    NOP();                              // delay slot

                    // NB trampoline code is emitted in the correct order
                    trampJ(targ);
                    trampNOP();                         // trampoline delay slot

                }
                else {
                    //  [linkedinstructions]
                    //  bxxx trampoline
                    //   lui $at,%hi(targ)
                    //  ...
                    // trampoline:
                    //  addiu $at,%lo(targ)
                    //  jr $at
                    //   nop

                    underrunProtect(5 * 4);             // keep bxx and trampoline together

                    LUI(AT,hi(uint32_t(targ)));         // delay slot

                    // NB trampoline code is emitted in the correct order
                    trampADDIU(AT, AT, lo(uint32_t(targ)));
                    trampJR(AT);
                    trampNOP();                         // trampoline delay slot

                }
            }
            else {
                // Worst case is bxxx;lui followed by addiu;jr;nop as above.
                // Best case is that the branch to the trampoline can be replaced
                // by a branch directly to the target, in which case the trampoline
                // is abandoned.
                // Fixup is handled in nPatchBranch.

                underrunProtect(5 * 4);                 // keep bxx and trampoline together

                NOP();                                  // delay slot

                trampNOP();
                trampNOP();
                trampNOP();

            }
            btarg = tramp;
        }

        return btarg;
    }


    NIns* Assembler::asm_bxx(bool branchOnFalse, LOpcode condop, Register ra, Register rb, NIns * const targ)
    {
        NIns *patch = NULL;
        NIns *btarg = asm_branchtarget(targ);

        if (cpu_has_fpu && isCmpDOpcode(condop)) {
            // c.xx.d $ra,$rb
            // bc1x   btarg
            switch (condop) {
            case LIR_eqd:
                if (branchOnFalse)
                    BC1F(btarg);
                else
                    BC1T(btarg);
                patch = _nIns;
                if (cpu_has_fpuhazard)
                    NOP();
                C_EQ_D(ra, rb);
                break;
            case LIR_ltd:
                if (branchOnFalse)
                    BC1F(btarg);
                else
                    BC1T(btarg);
                patch = _nIns;
                if (cpu_has_fpuhazard)
                    NOP();
                C_LT_D(ra, rb);
                break;
            case LIR_gtd:
                if (branchOnFalse)
                    BC1F(btarg);
                else
                    BC1T(btarg);
                patch = _nIns;
                if (cpu_has_fpuhazard)
                    NOP();
                C_LT_D(rb, ra);
                break;
            case LIR_led:
                if (branchOnFalse)
                    BC1F(btarg);
                else
                    BC1T(btarg);
                patch = _nIns;
                if (cpu_has_fpuhazard)
                    NOP();
                C_LE_D(ra, rb);
                break;
            case LIR_ged:
                if (branchOnFalse)
                    BC1F(btarg);
                else
                    BC1T(btarg);
                patch = _nIns;
                if (cpu_has_fpuhazard)
                    NOP();
                C_LE_D(rb, ra);
                break;
            default:
                BADOPCODE(condop);
                break;
            }
        }
        else {
            // general case
            // s[lg]tu?   $at,($ra,$rb|$rb,$ra)
            // b(ne|eq)z  $at,btarg
            switch (condop) {
            case LIR_eqi:
                // special case
                // b(ne|eq)  $ra,$rb,btarg
                if (branchOnFalse)
                    BNE(ra, rb, btarg);
                else {
                    if (ra == rb)
                        B(btarg);
                    else
                        BEQ(ra, rb, btarg);
                }
                patch = _nIns;
                break;
            case LIR_lti:
                if (branchOnFalse)
                    BEQ(AT, ZERO, btarg);
                else
                    BNE(AT, ZERO, btarg);
                patch = _nIns;
                SLT(AT, ra, rb);
                break;
            case LIR_gti:
                if (branchOnFalse)
                    BEQ(AT, ZERO, btarg);
                else
                    BNE(AT, ZERO, btarg);
                patch = _nIns;
                SLT(AT, rb, ra);
                break;
            case LIR_lei:
                if (branchOnFalse)
                    BNE(AT, ZERO, btarg);
                else
                    BEQ(AT, ZERO, btarg);
                patch = _nIns;
                SLT(AT, rb, ra);
                break;
            case LIR_gei:
                if (branchOnFalse)
                    BNE(AT, ZERO, btarg);
                else
                    BEQ(AT, ZERO, btarg);
                patch = _nIns;
                SLT(AT, ra, rb);
                break;
            case LIR_ltui:
                if (branchOnFalse)
                    BEQ(AT, ZERO, btarg);
                else
                    BNE(AT, ZERO, btarg);
                patch = _nIns;
                SLTU(AT, ra, rb);
                break;
            case LIR_gtui:
                if (branchOnFalse)
                    BEQ(AT, ZERO, btarg);
                else
                    BNE(AT, ZERO, btarg);
                patch = _nIns;
                SLTU(AT, rb, ra);
                break;
            case LIR_leui:
                if (branchOnFalse)
                    BNE(AT, ZERO, btarg);
                else
                    BEQ(AT, ZERO, btarg);
                patch = _nIns;
                SLT(AT, rb, ra);
                break;
            case LIR_geui:
                if (branchOnFalse)
                    BNE(AT, ZERO, btarg);
                else
                    BEQ(AT, ZERO, btarg);
                patch = _nIns;
                SLTU(AT, ra, rb);
                break;
            default:
                BADOPCODE(condop);
            }
        }
        TAG("asm_bxx(branchOnFalse=%d, condop=%s, ra=%s rb=%s targ=%p)",
            branchOnFalse, lirNames[condop], gpn(ra), gpn(rb), targ);
        return patch;
    }

    NIns* Assembler::asm_branch_ov(LOpcode op, NIns* target)
    {
        USE(op);
        NanoAssert(target != NULL);

        NIns* patch = asm_bxx(true, LIR_eqi, AT, ZERO, target);

        TAG("asm_branch_ov(op=%s, target=%p)", lirNames[op], target);
        return patch;
    }

    Branches Assembler::asm_branch(bool branchOnFalse, LIns *cond, NIns * const targ)
    {
        NanoAssert(cond->isCmp());
        LOpcode condop = cond->opcode();
        RegisterMask allow = isCmpDOpcode(condop) ? FpRegs : GpRegs;
        LIns *a = cond->oprnd1();
        LIns *b = cond->oprnd2();
        Register ra = findRegFor(a, allow);
        Register rb = (b==a) ? ra : findRegFor(b, allow & ~rmask(ra));

        return Branches(asm_bxx(branchOnFalse, condop, ra, rb, targ));
    }

    void Assembler::asm_j(NIns * const targ, bool bdelay)
    {
        if (targ == NULL) {
            NanoAssert(bdelay);
            (void) asm_bxx(false, LIR_eqi, ZERO, ZERO, targ);
        }
        else {
            NanoAssert(SEG(targ) == SEG(_nIns));
            if (bdelay) {
                underrunProtect(2*4);    // j + delay
                NOP();
            }
            J(targ);
        }
        TAG("asm_j(targ=%p) bdelay=%d", targ);
    }

    void
    Assembler::asm_spill(Register rr, int d, bool quad)
    {
        USE(quad);
        NanoAssert(d);
        if (IsFpReg(rr)) {
            NanoAssert(quad);
            asm_ldst64(true, rr, d, FP);
        }
        else {
            NanoAssert(!quad);
            asm_ldst(OP_SW, rr, d, FP);
        }
        TAG("asm_spill(rr=%d, d=%d, quad=%d)", rr, d, quad);
    }

    void
    Assembler::asm_nongp_copy(Register dst, Register src)
    {
        NanoAssert ((rmask(dst) & FpRegs) && (rmask(src) & FpRegs));
        MOV_D(dst, src);
        TAG("asm_nongp_copy(dst=%d src=%d)", dst, src);
    }

    /*
     * asm_arg will encode the specified argument according to the current ABI, and
     * will update r and stkd as appropriate so that the next argument can be
     * encoded.
     *
     * - doubles are 64-bit aligned.  both in registers and on the stack.
     *   If the next available argument register is A1, it is skipped
     *   and the double is placed in A2:A3.  If A0:A1 or A2:A3 are not
     *   available, the double is placed on the stack, 64-bit aligned.
     * - 32-bit arguments are placed in registers and 32-bit aligned
     *   on the stack.
     */
    void
    Assembler::asm_arg(ArgType ty, LIns* arg, Register& r, Register& fr, int& stkd)
    {
        // The stack offset must always be at least aligned to 4 bytes.
        NanoAssert((stkd & 3) == 0);

        if (ty == ARGTYPE_D) {
            // This task is fairly complex and so is delegated to asm_arg_64.
            asm_arg_64(arg, r, fr, stkd);
        } else {
            NanoAssert(ty == ARGTYPE_I || ty == ARGTYPE_UI);
            if (stkd < 16) {
                asm_regarg(ty, arg, r);
                fr = Register(fr + 1);
                r = Register(r + 1);
            }
            else
                asm_stkarg(arg, stkd);
            // The o32 ABI calling convention is that if the first argument
            // is not a double, subsequent double values are passed in integer registers.
            fr = r;
            stkd += 4;
        }
    }

    void
    Assembler::asm_call(LIns* ins)
    {
        if (!ins->isop(LIR_callv)) {
            Register rr;
            LOpcode op = ins->opcode();

            switch (op) {
            case LIR_calli:
                rr = retRegs[0];
                break;
            case LIR_calld:
                NanoAssert(cpu_has_fpu);
                rr = FV0;
                break;
            default:
                BADOPCODE(op);
                return;
            }

            deprecated_prepResultReg(ins, rmask(rr));
        }

        // Do this after we've handled the call result, so we don't
        // force the call result to be spilled unnecessarily.
        evictScratchRegsExcept(0);

        const CallInfo* ci = ins->callInfo();
        ArgType argTypes[MAXARGS];
        uint32_t argc = ci->getArgTypes(argTypes);
        bool indirect = ci->isIndirect();

        // FIXME: Put one of the argument moves into the BDS slot

        underrunProtect(2*4);    // jalr+delay
        NOP();
        JALR(T9);

        if (!indirect)
            // FIXME: If we can tell that we are calling non-PIC
            // (ie JIT) code, we could call direct instead of using t9
            asm_li(T9, ci->_address);
        else
            // Indirect call: we assign the address arg to t9
            // which matches the o32 ABI for calling functions
            asm_regarg(ARGTYPE_P, ins->arg(--argc), T9);

        // Encode the arguments, starting at A0 and with an empty argument stack.
        Register    r = A0, fr = FA0;
        int         stkd = 0;

        // Iterate through the argument list and encode each argument according to
        // the ABI.
        // Note that we loop through the arguments backwards as LIR specifies them
        // in reverse order.
        while(argc--)
            asm_arg(argTypes[argc], ins->arg(argc), r, fr, stkd);

        if (stkd > max_out_args)
            max_out_args = stkd;
        TAG("asm_call(ins=%p{%s})", ins, lirNames[ins->opcode()]);
    }

    Register
    Assembler::nRegisterAllocFromSet(RegisterMask set)
    {
        Register i;
        int n;

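        // ffs() returns the 1-based index of the least-significant set bit,
        // or 0 if no bit is set.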
        // note, deliberate truncation of 64->32 bits
        if (set & 0xffffffff) {
            // gp reg
            n = ffs(int(set));
            NanoAssert(n != 0);
            i = Register(n - 1);
        }
        else {
            // fp reg
            NanoAssert(cpu_has_fpu);
            n = ffs(int(set >> 32));
            NanoAssert(n != 0);
            i = Register(32 + n - 1);
        }
        _allocator.free &= ~rmask(i);
        TAG("nRegisterAllocFromSet(set=%016llx) => %s", set, gpn(i));
        return i;
    }

    void
    Assembler::nRegisterResetAll(RegAlloc& regs)
    {
        regs.clear();
        regs.free = GpRegs;
        if (cpu_has_fpu)
            regs.free |= FpRegs;
    }

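// Sign-extend the low 16 bits of a branch offset field to a 32-bit value.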
#define signextend16(s) ((int32_t(s)<<16)>>16)

    void
    Assembler::nPatchBranch(NIns* branch, NIns* target)
    {
        uint32_t op = (branch[0] >> 26) & 0x3f;
        uint32_t bdoffset = target-(branch+1);
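        // The major opcode lives in bits 31..26 of the branch instruction;
        // bdoffset is measured in instructions, relative to the delay slot
        // (branch+1), as MIPS PC-relative branches require.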

        if (op == OP_BEQ || op == OP_BNE ||
            ((branch[0] & 0xfffe0000) == ((OP_COP1 << 26) | (COP1_BC << 21)))) {
            if (isS16(bdoffset)) {
                // The branch is in range, so just replace the offset in the instruction
                // The trampoline that was allocated is redundant and will remain unused
                branch[0] = (branch[0]  & 0xffff0000) | (bdoffset & 0xffff);
            }
            else {
                // The branch is pointing to a trampoline. Find out where that is
                NIns *tramp = branch + 1 + (signextend16(branch[0] & 0xffff));
                if (SEG(branch) == SEG(target)) {
                    *tramp = J_FORMAT(OP_J,JINDEX(target));
                }
                else {
                    // Full 32-bit jump
                    // bxx tramp
                    //  lui   $at,%hi(target)
                    // ...
                    // tramp:
                    // addiu $at,$at,%lo(target)
                    // jr $at
                    //  nop
                    branch[1] = U_FORMAT(OP_LUI,0,AT,hi(uint32_t(target)));
                    tramp[0] = U_FORMAT(OP_ADDIU,AT,AT,lo(uint32_t(target)));
                    tramp[1] = R_FORMAT(OP_SPECIAL,AT,0,0,0,SPECIAL_JR);
                }
            }
        }
        else if (op == OP_J) {
            NanoAssert (SEG(branch) == SEG(target));
            branch[0] = J_FORMAT(OP_J,JINDEX(target));
        }
        else
            TODO(unknown_patch);
        // TAG("nPatchBranch(branch=%p target=%p)", branch, target);
    }

    void
    Assembler::nFragExit(LIns *guard)
    {
        SideExit *exit = guard->record()->exit;
        Fragment *frag = exit->target;
        bool destKnown = (frag && frag->fragEntry);

        // Generate jump to epilogue and initialize lr.

        // If the guard already exists, use a simple jump.
        if (destKnown) {
            // j     _fragEntry
            //  move $v0,$zero
            underrunProtect(2 * 4);     // j + branch delay
            MOVE(V0, ZERO);
            asm_j(frag->fragEntry, false);
        }
        else {
            // Target doesn't exist. Jump to an epilogue for now.
            // This can be patched later.
            if (!_epilogue)
                _epilogue = genEpilogue();
            GuardRecord *lr = guard->record();
            // FIXME: _epilogue may be in another segment
            // lui    $v0,%hi(lr)
            // j      _epilogue
            //  addiu $v0,%lo(lr)
            underrunProtect(2 * 4);     // j + branch delay
            ADDIU(V0, V0, lo(int32_t(lr)));
            asm_j(_epilogue, false);
            LUI(V0, hi(int32_t(lr)));
            lr->jmp = _nIns;
        }

        // profiling for the exit
        verbose_only(
            if (_logc->lcbits & LC_FragProfile) {
                // lui   $fp,%hi(profCount)
                // lw    $at,%lo(profCount)(fp)
                // addiu $at,1
                // sw    $at,%lo(profCount)(fp)
                uint32_t profCount = uint32_t(&guard->record()->profCount);
                SW(AT, lo(profCount), FP);
                ADDIU(AT, AT, 1);
                LW(AT, lo(profCount), FP);
                LUI(FP, hi(profCount));
            }
        )

        // Pop the stack frame.
        MOVE(SP, FP);

        // return value is GuardRecord*
        TAG("nFragExit(guard=%p{%s})", guard, lirNames[guard->opcode()]);
    }

    void
    Assembler::nInit()
    {
        nHints[LIR_calli]  = rmask(V0);
#if NJ_SOFTFLOAT_SUPPORTED
        nHints[LIR_hcalli] = rmask(V1);
#endif
        nHints[LIR_calld]  = rmask(FV0);
        nHints[LIR_paramp] = PREFER_SPECIAL;
    }

    void Assembler::nBeginAssembly()
    {
        max_out_args = 16;        // Always reserve space for a0-a3
    }

    // Increment the 32-bit profiling counter at pCtr, without
    // changing any registers.
    verbose_only(
    void Assembler::asm_inc_m32(uint32_t* /*pCtr*/)
    {
        // TODO: implement this
    }
    )

    void
    Assembler::nativePageReset(void)
    {
        _nSlot = 0;
        _nExitSlot = 0;
        TAG("nativePageReset()");
    }

    void
    Assembler::nativePageSetup(void)
    {
        NanoAssert(!_inExit);
        if (!_nIns)
            codeAlloc(codeStart, codeEnd, _nIns verbose_only(, codeBytes));
        if (!_nExitIns)
            codeAlloc(exitStart, exitEnd, _nExitIns verbose_only(, exitBytes));

        // The constant pool starts at the bottom of the page and moves up;
        // code starts at the top of the page and grows down.

        if (!_nSlot)
            _nSlot = codeStart;
        if (!_nExitSlot)
            _nExitSlot = exitStart;

        TAG("nativePageSetup()");
    }


    NIns*
    Assembler::genPrologue(void)
    {
        /*
         * Use a non-standard $fp because we don't know the final frame size until now.
         * addiu  $sp,$sp,-FRAMESIZE
         * sw     $ra,RA_OFFSET($sp)
         * sw     $fp,FP_OFFSET($sp)
         * move   $fp,$sp
         * addiu  $sp,$sp,-stackNeeded   (li $at,-stackNeeded / addu $sp,$sp,$at
         *                                when the immediate does not fit in 16 bits)
         */
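        /*
         * Resulting frame, as a sketch (exact offsets depend on FRAMESIZE,
         * RA_OFFSET and FP_OFFSET):
         *   $fp + FRAMESIZE  caller's $sp on entry
         *   $fp + RA_OFFSET  saved $ra
         *   $fp + FP_OFFSET  saved $fp
         *   $sp .. $fp       spill slots and outgoing argument area
         */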

        uint32_t stackNeeded = max_out_args + STACK_GRANULARITY * _activation.stackSlotsNeeded();
        uint32_t amt = alignUp(stackNeeded, NJ_ALIGN_STACK);

        if (amt) {
            if (isS16(-amt))
                ADDIU(SP, SP, -amt);
            else {
                ADDU(SP, SP, AT);
                asm_li(AT, -amt);
            }
        }

        NIns *patchEntry = _nIns; // FIXME: who uses this value and where should it point?

        MOVE(FP, SP);
        SW(FP, FP_OFFSET, SP);
        SW(RA, RA_OFFSET, SP);        // No need to save for leaf functions
        ADDIU(SP, SP, -FRAMESIZE);

        TAG("genPrologue()");

        return patchEntry;
    }

    NIns*
    Assembler::genEpilogue(void)
    {
        /*
         * move    $sp,$fp
         * lw      $ra,RA_OFFSET($sp)
         * lw      $fp,FP_OFFSET($sp)
         * jr      $ra
         *  addiu  $sp,$sp,FRAMESIZE    (branch delay slot)
         */
        underrunProtect(2*4);   // jr $ra + branch delay
        ADDIU(SP, SP, FRAMESIZE);
        JR(RA);
        LW(FP, FP_OFFSET, SP);
        LW(RA, RA_OFFSET, SP);
        MOVE(SP, FP);

        TAG("genEpilogue()");

        return _nIns;
    }

    RegisterMask
    Assembler::nHint(LIns* ins)
    {
        NanoAssert(ins->isop(LIR_paramp));
        RegisterMask prefer = 0;
        // FIXME: FLOAT parameters?
        if (ins->paramKind() == 0)
            if (ins->paramArg() < 4)
                prefer = rmask(argRegs[ins->paramArg()]);
        return prefer;
    }

    void
    Assembler::underrunProtect(int bytes)
    {
        NanoAssertMsg(bytes<=LARGEST_UNDERRUN_PROT, "constant LARGEST_UNDERRUN_PROT is too small");
        NanoAssert(_nSlot != 0);
        uintptr_t top = uintptr_t(_nSlot);
        uintptr_t pc = uintptr_t(_nIns);
        if (pc - bytes < top) {
            verbose_only(verbose_outputf("        %p:", _nIns);)
            NIns* target = _nIns;
            codeAlloc(codeStart, codeEnd, _nIns verbose_only(, codeBytes));

            _nSlot = codeStart;

            // _nSlot points to the first empty position in the new code block
            // _nIns points just past the last empty position.
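            // Branch from the end of the new chunk back to the code already
            // generated (emission is backwards, so 'target' is the first
            // instruction of that code).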
            asm_j(target, true);
        }
    }

    void
    Assembler::swapCodeChunks() {
        if (!_nExitIns)
            codeAlloc(exitStart, exitEnd, _nExitIns verbose_only(, exitBytes));
        if (!_nExitSlot)
            _nExitSlot = exitStart;
        SWAP(NIns*, _nIns, _nExitIns);
        SWAP(NIns*, _nSlot, _nExitSlot);
        SWAP(NIns*, codeStart, exitStart);
        SWAP(NIns*, codeEnd, exitEnd);
        verbose_only( SWAP(size_t, codeBytes, exitBytes); )
    }

    void
    Assembler::asm_insert_random_nop() {
        NanoAssert(0); // not supported
    }

}

#endif // FEATURE_NANOJIT && NANOJIT_MIPS