// BlockMultiplier.h
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#include "BlockMultiplierPlatform.h"
#include <malloc.h>
#include <xmmintrin.h>
#include <emmintrin.h>
#include <tmmintrin.h>
#include <immintrin.h>
#include <assert.h>
#include <cstdint>
#include <iostream>
#include <exception>
#include <thread>
#include <mutex>
#include <memory>
#include <vector>
#include "BlockMultiplierMatrixUtil.h"
#include "BlockHandlerSSE.h"
#ifdef SUPPORT_AVX2
#include "BlockHandlerAVX.h"
#endif
//#define STDTHREAD
#define OPENMPTHREAD
#ifdef STDTHREAD
#include "StdThreadPool.h"
#else
#ifdef OPENMPTHREAD
#include <omp.h>
#endif
#endif
namespace Microsoft { namespace MSR { namespace CNTK {
// All of the information needed for the various BlockHandler functions.
// Packaged into a struct because in some cases we call these functions on a thread pool
// and most thread pool class interfaces aren't happy with lots of arguments to the worker
// thread function.
//
// Throughout the code, I use the following nomenclature:
// A is the LHS matrix for the multiplication, B is the RHS, and C is the result.
// (i.e. A * B = C).
// m, k, and n are the matrix dimensions.
// m is the number of rows in A and C.
// k is the common dimension (the number of columns in A and rows in B).
// n is the number of columns in B and C.
// In other words, A is (m x k), B is (k x n) and C is (m x n).
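// For example, multiplying a (3 x 4) A by a (4 x 5) B gives a (3 x 5) C: m = 3, k = 4, n = 5.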
//
template<typename BlockHandlerT> struct HandlerArgs
{
int startRow;
int blocks;
int m;
int k;
int n;
typename BlockHandlerT::ScalarAT* newA;
typename BlockHandlerT::ScalarBT* B;
int32_t* transC;
int rowsPerThread;
int rowsPerBlock;
typename BlockHandlerT::VectorT* pBlockPreparedB;
};
// BlockMultiplier is a GEMM implementation that is optimized by
// reordering matrices so that they will be accessed sequentially
// in memory at multiplication time. This reduces cache misses and
// results in faster-executing code on the CPU.
// This class handles the rewriting of the matrices and the top level
// multiplication. Blocks of A and B (the LHS and RHS of the multiplication)
// are then handed off to a class implementing the BlockHandlerT interface.
// Implementations are provided for multiplying 16-bit integer matrices using
// the SSE and AVX2 instruction sets. To compile for AVX2, you need to add the /arch:AVX2
// flag to the compiler. Note that the AVX2 code only runs on Haswell or later processors
// and will raise an illegal-instruction fault on other machines.
// To use the code, first call PrepareB, which rewrites B in block order and returns
// a pointer to the rewritten copy (don't forget to free it with FreeMatrix when you're done
// multiplying by that matrix). Then you can call MultiplyMatrices().
// For details on how the block rewrite works, see the comments on RewriteBInBlockOrder and
// RewriteAInBlockOrder.
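// A minimal usage sketch (assumes the SSE handler; dimensions and values are illustrative):
//
//   typedef BlockMultiplier<BlockHandlerSSE> Multiplier;
//   Multiplier mult(4);                                        // 4 worker threads
//   int m = 8, k = 256, n = 64;
//   Multiplier::ScalarAT* A = Multiplier::CreateMatrixA(m, k);
//   Multiplier::ScalarBT* B = Multiplier::CreateMatrixB(k, n);
//   int32_t* C = Multiplier::CreateMatrixC(m, n);
//   // ... fill A and B ...
//   Multiplier::ScalarBT* preparedB = mult.PrepareB(B, k, n);  // rewrite B in block order
//   mult.MultiplyMatrices(A, m, k, preparedB, n, C);           // C = A * B
//   Multiplier::FreeMatrix(preparedB);                         // free the rewritten copy
//   Multiplier::FreeMatrix(A);
//   Multiplier::FreeMatrix(B);
//   Multiplier::FreeMatrix(C);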
template<typename BlockHandlerT> class BlockMultiplier
{
public:
// The type of the element in the A (LHS) matrix
typedef typename BlockHandlerT::ScalarAT ScalarAT;
// The type of the element in the B (RHS) matrix
typedef typename BlockHandlerT::ScalarBT ScalarBT;
// Right now we always produce 32-bit integer results. This could be changed if necessary.
typedef int32_t ScalarCT;
// The vectorized type of the block multiplier (e.g. __m128i for SSE, __m256i for AVX2).
typedef typename BlockHandlerT::VectorT VectorT;
private:
BlockHandlerT m_handler;
int referenceKernel(ScalarAT* A, ScalarBT* B, int blockSize);
void kernelsse8x2(__m128i xmmRow0, __m128i xmmRow1, short* B, short* return1, short* return2);
int RowToColOffsetRewrittenB(int col, int kOffset, int blockSize, int origCols);
int RowToColOffsetRewrittenA(int row, int kOffset, int blockSize, int rowsPerBlock, int origCols);
void RewriteBInBlockOrder(ScalarBT* oldB, ScalarBT* newB, int k, int n);
ScalarBT* RewriteBInBlockOrder(ScalarBT* oldB, ScalarBT* newB, int k, int n, int blockSize, int* kOffset);
void RewriteAInBlockOrder(ScalarAT* A, ScalarAT* newA, int m, int k, int blockSize, int rowsPerBlock);
ScalarAT* RewriteAInBlockOrder(ScalarAT* A, ScalarAT* newA, int m, int k, int blockSize, int rowsPerBlock, int* pKOffset);
ScalarAT* RewriteAInBlockOrder2(ScalarAT* A, ScalarAT* newA, int m, int k, int blockSize, int rowsPerBlock, int* pKOffset);
int m_blockSize;
std::mutex m_MultiplyMut;
// Function objects - thin wrappers around the thread functions (which know how to feed
// blocks to the actual dot product kernels implemented in BlockHandlerT).
class BlockHandler128x4Fn
{
public:
void operator()(HandlerArgs<BlockHandlerT> param) { BlockHandler128x4Thread(param); }
};
class BlockHandler64x4Fn
{
public:
void operator()(HandlerArgs<BlockHandlerT> param) { BlockHandler64x4Thread(param); }
};
class BlockHandler32x4Fn
{
public:
void operator()(HandlerArgs<BlockHandlerT> param) { BlockHandler32x4Thread(param); }
};
class BlockHandler16x4Fn
{
public:
void operator()(HandlerArgs<BlockHandlerT> param) { BlockHandler16x4Thread(param); }
};
class BlockHandler8x4Fn
{
public:
void operator()(HandlerArgs<BlockHandlerT> param) { BlockHandler8x4Thread(param); }
};
class BlockHandler128x1Fn
{
public:
void operator()(HandlerArgs<BlockHandlerT> param) { BlockHandler128x1Thread(param); }
};
class BlockHandler64x1Fn
{
public:
void operator()(HandlerArgs<BlockHandlerT> param) { BlockHandler64x1Thread(param); }
};
class BlockHandler32x1Fn
{
public:
void operator()(HandlerArgs<BlockHandlerT> param) { BlockHandler32x1Thread(param); }
};
class BlockHandler16x1Fn
{
public:
void operator()(HandlerArgs<BlockHandlerT> param) { BlockHandler16x1Thread(param); }
};
class BlockHandler8x1Fn
{
public:
void operator()(HandlerArgs<BlockHandlerT> param) { BlockHandler8x1Thread(param); }
};
#ifdef STDTHREAD
std::unique_ptr<StdThreadPool<HandlerArgs<BlockHandlerT>>> m_pPool;
#endif
// Extra handler-specific info about B, produced by BlockHandlerT::PrepareExtraB in PrepareB.
typename BlockHandlerT::VectorT* m_pBlockHandlerBInfo = nullptr;
// All of the BlockHandlerFooxBarThread functions work the same way.
// FooxBar means we are processing Foo elements from the common dimension K,
// Bar rows at a time. So 128x4 means we are processing four rows at a time, with
// 128 entries of the common dimension (k) at a time.
//
// We walk through all of the blocks for this set of rows in the common dimension, accumulating
// the partial sums in resultStorage as we go. These temporary results are then copied into the target
// matrix (C). Accumulating directly into C is a disaster: in multithreaded situations you get read-write
// hazards and lots of pipeline stalls trying to reconcile the cache, so accumulating locally
// ends up being faster even though we're allocating the memory each time. I think we can
// get rid of the allocation but I haven't had time to do this yet.
static void BlockHandler128x4Thread(HandlerArgs<BlockHandlerT> ha)
{
//Accumulate full row results locally before writing to C
//Align to the vector size (16 bytes for SSE, 32 for AVX2) so aligned vector loads are safe.
VectorT* resultStorage = (VectorT*)ALIGNED_ALLOC(sizeof(VectorT) * ha.rowsPerBlock * ha.n, sizeof(VectorT));
memset(resultStorage, 0, sizeof(VectorT) * ha.rowsPerBlock * ha.n);
const int blocksAtOnce = 2;
int32_t* transC = ha.transC;
for (int currBlock = 0; currBlock < ha.blocks; currBlock += blocksAtOnce)
{
BlockHandlerT::HandleBlock128x4(currBlock, ha.startRow, ha.k, ha.n, ha.newA,
ha.B, std::min(ha.blocks - currBlock, blocksAtOnce), resultStorage, ha.pBlockPreparedB);
}
int n = ha.n;
{
//This loop takes about the same amount of time as a memcpy-based version.
for (int c = 0; c < n; ++c)
{
//_mm_prefetch((char*)&(transC[RowColToOffset(c, startRow, m)]), _MM_HINT_T1);
VectorT result1 = resultStorage[RowColToOffset(0, c, n)];
VectorT result2 = resultStorage[RowColToOffset(1, c, n)];
VectorT result3 = resultStorage[RowColToOffset(2, c, n)];
VectorT result4 = resultStorage[RowColToOffset(3, c, n)];
int32_t firstHorizontal = my_hadd(result1);
int32_t secondHorizontal = my_hadd(result2);
int32_t thirdHorizontal = my_hadd(result3);
int32_t fourthHorizontal = my_hadd(result4);
transC[RowColToOffset(ha.startRow, c, n)] = firstHorizontal;
transC[RowColToOffset(ha.startRow + 1, c, n)] = secondHorizontal;
transC[RowColToOffset(ha.startRow + 2, c, n)] = thirdHorizontal;
transC[RowColToOffset(ha.startRow + 3, c, n)] = fourthHorizontal;
}
}
ALIGNED_FREE(resultStorage);
}
static void BlockHandler64x4Thread(HandlerArgs<BlockHandlerT> ha)
{
VectorT* resultStorage = (VectorT*)ALIGNED_ALLOC(sizeof(VectorT) * 4 * ha.n, sizeof(VectorT));
memset(resultStorage, 0, sizeof(VectorT) * 4 * ha.n);
int32_t* transC = ha.transC;
for (int currBlock = 0; currBlock < ha.blocks; ++currBlock)
{
BlockHandlerT::HandleBlock64x4(currBlock, ha.startRow, ha.k, ha.n, ha.newA,
ha.B, 1, resultStorage);
}
int n = ha.n;
{
for (int c = 0; c < n; ++c)
{
VectorT result1 = resultStorage[RowColToOffset(0, c, n)];
VectorT result2 = resultStorage[RowColToOffset(1, c, n)];
VectorT result3 = resultStorage[RowColToOffset(2, c, n)];
VectorT result4 = resultStorage[RowColToOffset(3, c, n)];
int32_t firstHorizontal = my_hadd(result1);
int32_t secondHorizontal = my_hadd(result2);
int32_t thirdHorizontal = my_hadd(result3);
int32_t fourthHorizontal = my_hadd(result4);
transC[RowColToOffset(ha.startRow, c, n)] += firstHorizontal;
transC[RowColToOffset(ha.startRow + 1, c, n)] += secondHorizontal;
transC[RowColToOffset(ha.startRow + 2, c, n)] += thirdHorizontal;
transC[RowColToOffset(ha.startRow + 3, c, n)] += fourthHorizontal;
}
}
ALIGNED_FREE(resultStorage);
}
static void BlockHandler32x4Thread(HandlerArgs<BlockHandlerT> ha)
{
VectorT* resultStorage = (VectorT*)ALIGNED_ALLOC(sizeof(VectorT) * 4 * ha.n, sizeof(VectorT));
memset(resultStorage, 0, sizeof(VectorT) * 4 * ha.n);
int32_t* transC = ha.transC;
for (int currBlock = 0; currBlock < ha.blocks; ++currBlock)
{
BlockHandlerT::HandleBlock32x4(currBlock, ha.startRow, ha.k, ha.n, ha.newA,
ha.B, 1, resultStorage);
}
int n = ha.n;
{
for (int c = 0; c < n; ++c)
{
VectorT result1 = resultStorage[RowColToOffset(0, c, n)];
VectorT result2 = resultStorage[RowColToOffset(1, c, n)];
VectorT result3 = resultStorage[RowColToOffset(2, c, n)];
VectorT result4 = resultStorage[RowColToOffset(3, c, n)];
int32_t firstHorizontal = my_hadd(result1);
int32_t secondHorizontal = my_hadd(result2);
int32_t thirdHorizontal = my_hadd(result3);
int32_t fourthHorizontal = my_hadd(result4);
transC[RowColToOffset(ha.startRow, c, n)] += firstHorizontal;
transC[RowColToOffset(ha.startRow + 1, c, n)] += secondHorizontal;
transC[RowColToOffset(ha.startRow + 2, c, n)] += thirdHorizontal;
transC[RowColToOffset(ha.startRow + 3, c, n)] += fourthHorizontal;
}
}
ALIGNED_FREE(resultStorage);
}
static void BlockHandler16x4Thread(HandlerArgs<BlockHandlerT> ha)
{
VectorT* resultStorage = (VectorT*)ALIGNED_ALLOC(sizeof(VectorT) * 4 * ha.n, sizeof(VectorT));
memset(resultStorage, 0, sizeof(VectorT) * 4 * ha.n);
int32_t* transC = ha.transC;
for (int currBlock = 0; currBlock < ha.blocks; ++currBlock)
{
BlockHandlerT::HandleBlock16x4(currBlock, ha.startRow, ha.k, ha.n, ha.newA,
ha.B, 1, resultStorage);
}
int n = ha.n;
{
//This loop takes about the same amount of time as a memcpy-based version.
for (int c = 0; c < n; ++c)
{
VectorT result1 = resultStorage[RowColToOffset(0, c, n)];
VectorT result2 = resultStorage[RowColToOffset(1, c, n)];
VectorT result3 = resultStorage[RowColToOffset(2, c, n)];
VectorT result4 = resultStorage[RowColToOffset(3, c, n)];
int32_t firstHorizontal = my_hadd(result1);
int32_t secondHorizontal = my_hadd(result2);
int32_t thirdHorizontal = my_hadd(result3);
int32_t fourthHorizontal = my_hadd(result4);
transC[RowColToOffset(ha.startRow, c, n)] += firstHorizontal;
transC[RowColToOffset(ha.startRow + 1, c, n)] += secondHorizontal;
transC[RowColToOffset(ha.startRow + 2, c, n)] += thirdHorizontal;
transC[RowColToOffset(ha.startRow + 3, c, n)] += fourthHorizontal;
}
}
ALIGNED_FREE(resultStorage);
}
static void BlockHandler8x4Thread(HandlerArgs<BlockHandlerT> ha)
{
__m128i* resultStorage = (__m128i*)ALIGNED_ALLOC(sizeof(__m128i) * 4 * ha.n, 16);
memset(resultStorage, 0, sizeof(__m128i) * 4 * ha.n);
int32_t* transC = ha.transC;
//_mm_prefetch((char*)&(transC[RowColToOffset(c, ha.startRow, m)]), _MM_HINT_T1);
for (int currBlock = 0; currBlock < ha.blocks; ++currBlock)
{
BlockHandlerT::HandleBlock8x4(currBlock, ha.startRow, ha.k, ha.n, ha.newA,
ha.B, 1, resultStorage);
}
int n = ha.n;
{
for (int c = 0; c < n; ++c)
{
__m128i result1 = resultStorage[RowColToOffset(0, c, n)];
__m128i result2 = resultStorage[RowColToOffset(1, c, n)];
__m128i result3 = resultStorage[RowColToOffset(2, c, n)];
__m128i result4 = resultStorage[RowColToOffset(3, c, n)];
int32_t firstHorizontal = my_hadd(result1);
int32_t secondHorizontal = my_hadd(result2);
int32_t thirdHorizontal = my_hadd(result3);
int32_t fourthHorizontal = my_hadd(result4);
transC[RowColToOffset(ha.startRow, c, n)] += firstHorizontal;
transC[RowColToOffset(ha.startRow + 1, c, n)] += secondHorizontal;
transC[RowColToOffset(ha.startRow + 2, c, n)] += thirdHorizontal;
transC[RowColToOffset(ha.startRow + 3, c, n)] += fourthHorizontal;
}
}
ALIGNED_FREE(resultStorage);
}
static void BlockHandler128x1Thread(HandlerArgs<BlockHandlerT> ha)
{
VectorT* resultStorage = (VectorT*)ALIGNED_ALLOC(sizeof(VectorT) * ha.rowsPerBlock * ha.n, 64);
memset(resultStorage, 0, sizeof(VectorT) * ha.rowsPerBlock * ha.n);
const int blocksAtOnce = 2;
int32_t* transC = ha.transC;
for (int currBlock = 0; currBlock < ha.blocks; currBlock += blocksAtOnce)
{
BlockHandlerT::HandleBlock128x1(currBlock, ha.startRow, ha.k, ha.n,
ha.newA, ha.B, std::min(ha.blocks - currBlock, blocksAtOnce), resultStorage, ha.pBlockPreparedB);
}
int n = ha.n;
{
for (int c = 0; c < n; ++c)
{
VectorT result1 = resultStorage[RowColToOffset(0, c, n)];
int32_t firstHorizontal = my_hadd(result1);
transC[RowColToOffset(ha.startRow, c, n)] = firstHorizontal;
}
}
ALIGNED_FREE(resultStorage);
}
static void BlockHandler64x1Thread(HandlerArgs<BlockHandlerT> ha)
{
VectorT* resultStorage = (VectorT*)ALIGNED_ALLOC(sizeof(VectorT) * ha.rowsPerBlock * ha.n, sizeof(VectorT));
memset(resultStorage, 0, sizeof(VectorT) * ha.rowsPerBlock * ha.n);
int32_t* transC = ha.transC;
for (int currBlock = 0; currBlock < ha.blocks; ++currBlock)
{
BlockHandlerT::HandleBlock64x1(currBlock, ha.startRow, ha.k,
ha.n, ha.newA, ha.B, 1, resultStorage);
}
int n = ha.n;
{
for (int c = 0; c < n; ++c)
{
VectorT result1 = resultStorage[RowColToOffset(0, c, n)];
int32_t firstHorizontal = my_hadd(result1);
transC[RowColToOffset(ha.startRow, c, n)] += firstHorizontal;
}
}
ALIGNED_FREE(resultStorage);
}
static void BlockHandler32x1Thread(HandlerArgs<BlockHandlerT> ha)
{
VectorT* resultStorage = (VectorT*)ALIGNED_ALLOC(sizeof(VectorT) * ha.rowsPerBlock * ha.n, sizeof(VectorT));
memset(resultStorage, 0, sizeof(VectorT) * ha.rowsPerBlock * ha.n);
int32_t* transC = ha.transC;
for (int currBlock = 0; currBlock < ha.blocks; ++currBlock)
{
BlockHandlerT::HandleBlock32x1(currBlock, ha.startRow, ha.k,
ha.n, ha.newA, ha.B, 1, resultStorage);
}
int n = ha.n;
{
for (int c = 0; c < n; ++c)
{
VectorT result1 = resultStorage[RowColToOffset(0, c, n)];
int32_t firstHorizontal = my_hadd(result1);
transC[RowColToOffset(ha.startRow, c, n)] += firstHorizontal;
}
}
ALIGNED_FREE(resultStorage);
}
static void BlockHandler16x1Thread(HandlerArgs<BlockHandlerT> ha)
{
VectorT* resultStorage = (VectorT*)ALIGNED_ALLOC(sizeof(VectorT) * ha.rowsPerBlock * ha.n, sizeof(VectorT));
memset(resultStorage, 0, sizeof(VectorT) * ha.rowsPerBlock * ha.n);
int32_t* transC = ha.transC;
for (int currBlock = 0; currBlock < ha.blocks; ++currBlock)
{
BlockHandlerT::HandleBlock16x1(currBlock, ha.startRow, ha.k, ha.n,
ha.newA, ha.B, 1, resultStorage);
}
int n = ha.n;
{
for (int c = 0; c < n; ++c)
{
VectorT result1 = resultStorage[RowColToOffset(0, c, n)];
int32_t firstHorizontal = my_hadd(result1);
transC[RowColToOffset(ha.startRow, c, n)] += firstHorizontal;
}
}
ALIGNED_FREE(resultStorage);
}
static void BlockHandler8x1Thread(HandlerArgs<BlockHandlerT> ha)
{
__m128i* resultStorage = (__m128i*)ALIGNED_ALLOC(sizeof(__m128i) * ha.rowsPerBlock * ha.n, 64);
memset(resultStorage, 0, sizeof(__m128i) * ha.rowsPerBlock * ha.n);
int32_t* transC = ha.transC;
for (int currBlock = 0; currBlock < ha.blocks; ++currBlock)
{
BlockHandlerT::HandleBlock8x1(currBlock, ha.startRow, ha.k, ha.n,
ha.newA, ha.B, 1, resultStorage);
}
int n = ha.n;
{
for (int c = 0; c < n; ++c)
{
__m128i result1 = resultStorage[RowColToOffset(0, c, n)];
int32_t firstHorizontal = my_hadd(result1);
transC[RowColToOffset(ha.startRow, c, n)] += firstHorizontal;
}
}
ALIGNED_FREE(resultStorage);
}
public:
//Borrowed from [https://software.intel.com/en-us/forums/intel-isa-extensions/topic/285219]
//Results in a saturated add, i.e. in the overflow case we return int_max, and in the
//underflow case we return int_min.
//The code is clever - it uses the _mm_blendv_ps instruction to avoid branching.
//It's based on the fact that you can only have overflow if the signs of the operands
//are identical and different from the sign of the result.
//The saturating version makes matrix multiplies a few percent slower than the non-saturating code,
//but we avoid the wild inaccuracies resulting from over/underflow.
FORCEINLINE static __m128i my_adds_epi32(__m128i a, __m128i b)
{
__m128i int_min = _mm_set1_epi32(0x80000000);
__m128i int_max = _mm_set1_epi32(0x7FFFFFFF);
__m128i res = _mm_add_epi32(a, b);
__m128i sign_and = _mm_and_si128(a, b);
__m128i sign_or = _mm_or_si128(a, b);
__m128i min_sat_mask = _mm_andnot_si128(res, sign_and);
__m128i max_sat_mask = _mm_andnot_si128(sign_or, res);
__m128 res_temp = _mm_blendv_ps(_mm_castsi128_ps(res), _mm_castsi128_ps(int_min), _mm_castsi128_ps(min_sat_mask));
return _mm_castps_si128(_mm_blendv_ps(res_temp, _mm_castsi128_ps(int_max), _mm_castsi128_ps(max_sat_mask)));
}
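//For example (a hypothetical sanity check, not part of the build):
//  __m128i big = _mm_set1_epi32(INT32_MAX);
//  __m128i one = _mm_set1_epi32(1);
//  __m128i sat = my_adds_epi32(big, one); //every lane saturates to INT32_MAX instead of wrapping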
//Saturated horizontal add
FORCEINLINE static int32_t my_hadd(__m128i hAddMe)
{
//We have lanes [A, B, C, D] and we want A + B + C + D.
//Swap the 64-bit halves to get [C, D, A, B]
__m128i shuff1 = _mm_shuffle_epi32(hAddMe, _MM_SHUFFLE(1, 0, 3, 2));
__m128i res1 = my_adds_epi32(hAddMe, shuff1);
//Now we have
//[A+C, B+D, A+C, B+D], so shuffle and add once more
__m128i shuff2 = _mm_shuffle_epi32(res1, _MM_SHUFFLE(0, 1, 2, 3));
//This gives us
//[B+D, A+C, B+D, A+C]
__m128i res2 = my_adds_epi32(res1, shuff2);
return _mm_extract_epi32(res2, 0);
}
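//For example, my_hadd(_mm_setr_epi32(1, 2, 3, 4)) returns 10.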
#ifdef SUPPORT_AVX2
//Same as above, for AVX registers
FORCEINLINE static __m256i my_adds_epi32(__m256i a, __m256i b)
{
__m256i int_min = _mm256_set1_epi32(0x80000000);
__m256i int_max = _mm256_set1_epi32(0x7FFFFFFF);
__m256i res = _mm256_add_epi32(a, b);
__m256i sign_and = _mm256_and_si256(a, b);
__m256i sign_or = _mm256_or_si256(a, b);
__m256i min_sat_mask = _mm256_andnot_si256(res, sign_and);
__m256i max_sat_mask = _mm256_andnot_si256(sign_or, res);
__m256 res_temp = _mm256_blendv_ps(_mm256_castsi256_ps(res),
_mm256_castsi256_ps(int_min), _mm256_castsi256_ps(min_sat_mask));
return _mm256_castps_si256(_mm256_blendv_ps(res_temp, _mm256_castsi256_ps(int_max), _mm256_castsi256_ps(max_sat_mask)));
}
//Same as above, for AVX registers
FORCEINLINE static int32_t my_hadd(__m256i hAddMe)
{
//Reduce within each 128-bit lane, exactly as in the SSE version above...
__m256i shuff1 = _mm256_shuffle_epi32(hAddMe, _MM_SHUFFLE(1, 0, 3, 2));
__m256i res1 = my_adds_epi32(hAddMe, shuff1);
__m256i shuff2 = _mm256_shuffle_epi32(res1, _MM_SHUFFLE(0, 1, 2, 3));
__m256i res2 = my_adds_epi32(res1, shuff2);
//...then swap the two 128-bit lanes and add, leaving the full sum in every lane.
__m256i shifted = _mm256_permute2f128_si256(res2, res2, 1);
__m256i res3 = my_adds_epi32(res2, shifted);
union { int32_t i[8]; __m256i v; } u;
u.v = res3;
return u.i[0];
}
#endif
int m_numThreads;
BlockMultiplier(int numThreads = 1)
{
SetNumThreads(numThreads);
}
void SetNumThreads(int threads)
{
m_numThreads = threads;
#ifdef STDTHREAD
m_pPool.reset(new StdThreadPool<HandlerArgs<BlockHandlerT>>(threads));
#else
#ifdef OPENMPTHREAD
omp_set_num_threads(threads);
#endif
#endif
}
~BlockMultiplier()
{
//m_pBlockHandlerBInfo is only set if PrepareB was called.
if (m_pBlockHandlerBInfo != nullptr)
BlockHandlerT::FreePreparedB(m_pBlockHandlerBInfo);
}
static ScalarAT* CreateMatrixA(int m, int n, ScalarAT initVal = 0);
static ScalarBT* CreateMatrixB(int m, int n, ScalarBT initVal = 0);
static int32_t* CreateMatrixC(int m, int n, int32_t initVal = 0);
ScalarBT* PrepareB(ScalarBT* oldB, int k, int n);
template<typename ScalarT> static void FreeMatrix(ScalarT* freeMe) { FreeAlignedMatrix<ScalarT>(freeMe); }
//We assume B has been rewritten in block order.
//For now we assume m, k and n are all multiples of kernelsize.
void MultiplyMatrices(ScalarAT* A, int m, int k, ScalarBT* B, int n, int32_t* C, ScalarAT alpha = 1, ScalarBT beta = 0);
static const int MAXRANGE = 1 << 13;
};
// Declare specializations of MAXRANGE for the supported block handlers.
template<> const int BlockMultiplier<BlockHandlerSSE>::MAXRANGE;
#ifdef SUPPORT_AVX2
template<> const int BlockMultiplier<BlockHandlerAVX>::MAXRANGE;
#endif
template<typename BlockHandlerT> typename BlockMultiplier<BlockHandlerT>::ScalarAT* BlockMultiplier<BlockHandlerT>::CreateMatrixA(int m, int n, ScalarAT initVal)
{
return CreateAlignedMatrix<ScalarAT>(m, n, initVal);
}
template<typename BlockHandlerT> typename BlockMultiplier<BlockHandlerT>::ScalarBT* BlockMultiplier<BlockHandlerT>::CreateMatrixB(int m, int n, ScalarBT initVal)
{
return CreateAlignedMatrix<ScalarBT>(m, n, initVal);
}
template<typename BlockHandlerT> int32_t* BlockMultiplier<BlockHandlerT>::CreateMatrixC(int m, int n, int32_t initVal)
{
return CreateAlignedMatrix<int32_t>(m, n, initVal);
}
// reference kernel to get the algo right
// If you make any changes to the block algorithm, test it with this.
template<typename BlockHandlerT> FORCEINLINE int BlockMultiplier<BlockHandlerT>::referenceKernel(ScalarAT* A,
ScalarBT* B, int blockSize)
{
//For now just use regular instructions
//until we have the breakdown figured out.
int accum = 0;
for (int i = 0; i < blockSize; ++i)
{
accum += A[i] * B[i];
}
return accum;
}
//Rewrites B in Block order so that memory accesses to B will be sequential.
//See comments on RewriteBInBlockOrder for details.
template<typename BlockHandlerT> typename BlockMultiplier<BlockHandlerT>::ScalarBT* BlockMultiplier<BlockHandlerT>::PrepareB(ScalarBT* oldB, int k, int n)
{
ScalarBT* newB = CreateMatrixB(k, n);
int offset = 0;
ScalarBT* next = RewriteBInBlockOrder(oldB, newB, k, n, 128, &offset);
if (offset < k)
{
next = RewriteBInBlockOrder(oldB, next, k, n, 64, &offset);
}
if (offset < k)
{
next = RewriteBInBlockOrder(oldB, next, k, n, 32, &offset);
}
if (offset < k)
{
next = RewriteBInBlockOrder(oldB, next, k, n, 16, &offset);
}
if (offset < k)
{
next = RewriteBInBlockOrder(oldB, next, k, n, 8, &offset);
}
if (offset < k)
{
int blockSize = k - offset;
next = RewriteBInBlockOrder(oldB, next, k, n, blockSize, &offset);
}
assert(next - newB == k * n);
m_pBlockHandlerBInfo = BlockHandlerT::PrepareExtraB(newB, k, n);
return newB;
}
//A version of RewriteBInBlockOrder that allows for multiple decreasing block sizes
//(which is necessary for letting us handle arbitrarily-sized matrices). On the first call, set kOffset to zero, and
//newB to the beginning of the newly allocated B array.
//We return a pointer to the next writable element in newB, and kOffset gets the new offset into the shared dimension.
//So if you have blocks of size 128 and 64, do something like this:
//short* newB = alloc_b();
//int kOffset = 0;
////Write out the 128-wide blocks, then the 64-wide remainder.
//short* nextB = RewriteBInBlockOrder(oldB, newB, k, n, 128, &kOffset);
//short* lastB = RewriteBInBlockOrder(oldB, nextB, k, n, 64, &kOffset);
//For each block, the logic is as follows:
//Given a block size of 2 and a matrix that looks like this:
// B00 B01 B02 B03
// B10 B11 B12 B13
// B20 B21 B22 B23
// B30 B31 B32 B33
//.. we will rewrite it like this
// B00 B10 B01 B11 B02 B12 B03 B13 //The first block of columns 0, 1, 2, and 3.
// B20 B30 B21 B31 B22 B32 B23 B33 //The second block of columns 0, 1, 2, and 3.
//So each "row" of the new matrix (block size * num_original_cols) contains all of the values that
//will be multiplied with block zero of a given row.
//Column offsets within the row can be computed by (block_size * col_offset).
//Given the original B matrix, we restructure it so that we will access all of its elements in block order.
//That way we can load in blocksize elements from A, then read in the corresponding elements from each
//column of B in sequential order.
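//A worked example of the indexing (a sketch of the arithmetic, matching the 4x4 example above
//with k = 4, n = 4, blockSize = 2): element B32 (row 3, column 2) lands at
//  blockRow = 3 / 2 = 1                               //second "row" of blocks
//  within   = 3 % 2 = 1                               //second element within the block
//  newIdx   = blockRow * (blockSize * n) + 2 * blockSize + within = 8 + 4 + 1 = 13
//which is exactly where B32 sits in the rewritten layout shown above.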
template<typename BlockHandlerT> typename BlockMultiplier<BlockHandlerT>::ScalarBT* BlockMultiplier<BlockHandlerT>::RewriteBInBlockOrder(ScalarBT* oldB, ScalarBT* newB, int k,
int n, int blockSize, int* kOffset)
{
ScalarBT* curr = newB;
int numBlocks = (k - *kOffset) / blockSize;
for (int b = 0; b < numBlocks; ++b)
{
//row offset to beginning of block
int blockOffset = *kOffset + (b * blockSize);
for (int c = 0; c < n; ++c)
{
for (int rowOffset = 0; rowOffset < blockSize; ++rowOffset)
{
int idx = RowColToOffset(blockOffset + rowOffset, c, n);
*curr++ = oldB[idx];
}
}
}
//Bump offset for next level down
*kOffset += (numBlocks * blockSize);
//Return ptr to next value to be written
return curr;
}
//We can also reorder A, pulling in multiple rows at once.
//So while the input looks like this in memory:
// Row1Block1 Row1Block2 Row1Block3
// Row2Block1 Row2Block2 Row2Block3
// Row3Block1 Row3Block2 Row3Block3
// Row4Block1 Row4Block2 Row4Block3
//
// we want the rewritten array to look like this (assuming 2 rows per block):
//Row1Block1 Row2Block1 Row1Block2 Row2Block2 Row1Block3 Row2Block3
//Row3Block1 Row4Block1 Row3Block2 Row4Block2 Row3Block3 Row4Block3
//Each new "row" has length 2 (rows per block) * 3 (blocks per row) * blockSize elements.
//So the first "row" contains the first rowsPerBlock rows in block order.
//We can index into the start of block b in a row via (b * blockSize * rowsPerBlock)
//blockSize is the size of our dot product kernel.
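//A worked example of the indexing (a sketch, using the 4-row example above with
//rowsPerBlock = 2, 3 blocks per row, and block size s): the start of Row3Block2
//(zero-based row 2, block 1) is
//  group = 2 / 2 = 1                                  //second output "row"
//  start = group * (3 * 2 * s) + 1 * (2 * s) + (2 % 2) * s = 8s
//which matches its position in the layout above.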
template<typename BlockHandlerT> void BlockMultiplier<BlockHandlerT>::RewriteAInBlockOrder(ScalarAT* A, ScalarAT* newA, int m,
int k, int /*blockSize*/, int rowsPerBlock)
{
int currOffset = 0;
ScalarAT* next = RewriteAInBlockOrder(A, newA, m, k, 128, rowsPerBlock, &currOffset);
if (currOffset < k)
{
next = RewriteAInBlockOrder(A, next, m, k, 64, rowsPerBlock, &currOffset);
}
if (currOffset < k)
{
next = RewriteAInBlockOrder(A, next, m, k, 32, rowsPerBlock, &currOffset);
}
if (currOffset < k)
{
next = RewriteAInBlockOrder(A, next, m, k, 16, rowsPerBlock, &currOffset);
}
if (currOffset < k)
{
next = RewriteAInBlockOrder(A, next, m, k, 8, rowsPerBlock, &currOffset);
}
if (currOffset < k)
{
int blockSize = k - currOffset;
next = RewriteAInBlockOrder(A, next, m, k, blockSize, rowsPerBlock, &currOffset);
}
}
template<typename BlockHandlerT> typename BlockMultiplier<BlockHandlerT>::ScalarAT* BlockMultiplier<BlockHandlerT>::RewriteAInBlockOrder(ScalarAT* A, ScalarAT* newA, int m, int k,
int blockSize, int rowsPerBlock, int* pKOffset)
{
ScalarAT* currOut = newA;
int blocksPerRow = (k - *pKOffset) / blockSize;
for (int startRow = 0; startRow < m; startRow += rowsPerBlock)
{
for (int currBlock = 0; currBlock < blocksPerRow; ++currBlock)
{
for (int rowOffset = 0; rowOffset < rowsPerBlock; ++rowOffset)
{
int startCol = *pKOffset + (currBlock * blockSize);
for (int c = 0; c < blockSize; ++c)
{
*currOut++ = A[RowColToOffset(startRow + rowOffset, startCol + c, k)];
}
}
}
}
*pKOffset += blocksPerRow * blockSize;
return currOut;
}
//I tried parallelizing this and got no speedup, which doesn't really make sense to me.
template<typename BlockHandlerT> typename BlockMultiplier<BlockHandlerT>::ScalarAT* BlockMultiplier<BlockHandlerT>::RewriteAInBlockOrder2(ScalarAT* A, ScalarAT* newA,
int m, int k, int blockSize, int rowsPerBlock, int* pKOffset)
{
ScalarAT* currOut = newA;
int blocksPerRow = (k - *pKOffset) / blockSize;
//#pragma omp parallel for
for (int startRow = 0; startRow < m; startRow += rowsPerBlock)
{
//Start of this row group in the output: each group of rowsPerBlock rows occupies
//blocksPerRow * rowsPerBlock * blockSize elements, and startRow / rowsPerBlock groups precede it.
int offset3 = startRow * blocksPerRow * blockSize;
for (int currBlock = 0; currBlock < blocksPerRow; ++currBlock)
{
int outOffset2 = currBlock * blockSize * rowsPerBlock;
for (int rowOffset = 0; rowOffset < rowsPerBlock; ++rowOffset)
{
int outOffset = rowOffset * blockSize;
int startCol = *pKOffset + (currBlock * blockSize);
for (int c = 0; c < blockSize; ++c)
{
*(currOut + outOffset + outOffset2 + offset3 + c) = A[RowColToOffset(startRow + rowOffset, startCol + c, k)];
}
}
}
}
currOut += blockSize * rowsPerBlock * blocksPerRow * (m/rowsPerBlock);
*pKOffset += blocksPerRow * blockSize;
return currOut;
}
template<typename BlockHandlerT> struct BlockInfo
{
BlockInfo(int cnt, int setK, int offsetASet, int offsetBSet, std::function<void(HandlerArgs<BlockHandlerT>)> fourFnSet,
std::function<void(HandlerArgs<BlockHandlerT>)> oneFnSet)
: blockCnt(cnt), k(setK), offsetA(offsetASet), offsetB(offsetBSet), fourFn(fourFnSet), oneFn(oneFnSet) {}
int blockCnt;
int k;
int offsetA;
int offsetB;
std::function<void (HandlerArgs<BlockHandlerT>)> fourFn;
std::function<void (HandlerArgs<BlockHandlerT>)> oneFn;
};
//We assume B has been rewritten in block order.
//For now we assume m, k and n are all multiples of kernelsize.
template<typename BlockHandlerT> void BlockMultiplier<BlockHandlerT>::MultiplyMatrices(ScalarAT* A, int m, int k, ScalarBT* B, int n,
int32_t* C, ScalarAT alpha, ScalarBT beta)
{
// The lock here is required whenever you call MultiplyMatrices on the same BlockMultiplier object
// from multiple threads (for instance, simultaneously multiplying two different inputs by the same
// first-layer weight matrix). It is an artifact of the thread pool I am using when STDTHREAD is defined
// and may not be necessary in the openmp case, but I have not tested this scenario so I'm leaving it in for now.
std::lock_guard<std::mutex> lock(m_MultiplyMut);
{
int rowsPerBlock = m / m_numThreads;
if (rowsPerBlock < 4)
rowsPerBlock = 1;
if (rowsPerBlock > 4)
rowsPerBlock = 4;
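//e.g. m = 64 with 8 threads gives 64 / 8 = 8, clamped to 4 rows per block;
//m = 2 with 4 threads gives 2 / 4 = 0, raised to 1.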
if (m % rowsPerBlock != 0)
{
throw std::runtime_error("Error: m must be < 4 or a multiple of 4.");
}
if (alpha != 1 || beta != 0)
{
throw std::logic_error("alpha / beta not yet implemented for this class");
}
ScalarAT* newA = CreateMatrixA(m, k);
RewriteAInBlockOrder(A, newA, m, k, m_blockSize, rowsPerBlock);
int blocks128 = k / 128;
int k128 = blocks128 * 128;
int blocks64 = (k - k128) / 64;
int k64 = blocks64 * 64;
int blocks32 = (k - k128 - k64) / 32;
int k32 = blocks32 * 32;
int blocks16 = (k - k128 - k64 - k32) / 16;
int k16 = blocks16 * 16;
int blocks8 = (k - k128 - k64 - k32 - k16) / 8;
int k8 = blocks8 * 8;
int blocks1 = (k - k128 - k64 - k32 - k16 - k8);
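//e.g. k = 250 decomposes as 1 x 128 + 1 x 64 + 1 x 32 + 1 x 16 + 1 x 8 + 2 x 1,
//so every element of the shared dimension is covered exactly once.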
int offsetA64 = m * blocks128 * 128;
int offsetB64 = n * blocks128 * 128;
int offsetA32 = offsetA64 + (m * k64);
int offsetB32 = offsetB64 + (n * k64);
int offsetA16 = offsetA32 + (m * k32);
int offsetB16 = offsetB32 + (n * k32);
int offsetA8 = offsetA16 + (m * k16);
int offsetB8 = offsetB16 + (n * k16);
int offsetA1 = offsetA8 + (m * k8);
int offsetB1 = offsetB8 + (n * k8);
BlockHandler128x4Fn fn128x4;
BlockHandler64x4Fn fn64x4;
BlockHandler32x4Fn fn32x4;
BlockHandler16x4Fn fn16x4;
BlockHandler8x4Fn fn8x4;
BlockHandler128x1Fn fn128x1;
BlockHandler64x1Fn fn64x1;
BlockHandler32x1Fn fn32x1;
BlockHandler16x1Fn fn16x1;
BlockHandler8x1Fn fn8x1;
std::vector<BlockInfo<BlockHandlerT>*> blockInfos(5);
//std::function<void __cdecl(HandlerArgs<BlockHandlerT>)> fourFn = BlockHandler128x4Thread;
//std::function<void __cdecl(HandlerArgs<BlockHandlerT>)> oneFn = BlockHandler128x1Thread;
BlockInfo<BlockHandlerT> bi128(blocks128, k128, 0, 0, fn128x4, fn128x1);
BlockInfo<BlockHandlerT> bi64(blocks64, k64, offsetA64, offsetB64, fn64x4, fn64x1);
BlockInfo<BlockHandlerT> bi32(blocks32, k32, offsetA32, offsetB32, fn32x4, fn32x1);
BlockInfo<BlockHandlerT> bi16(blocks16, k16, offsetA16, offsetB16, fn16x4, fn16x1);
BlockInfo<BlockHandlerT> bi8(blocks8, k8, offsetA8, offsetB8, fn8x4, fn8x1);
blockInfos[0] = &bi128;
blockInfos[1] = &bi64;
blockInfos[2] = &bi32;
blockInfos[3] = &bi16;
blockInfos[4] = &bi8;
for (size_t i = 0; i < blockInfos.size(); ++i)
{
BlockInfo<BlockHandlerT>& currBlockInfo = *(blockInfos[i]);
if (currBlockInfo.blockCnt > 0)
{
HandlerArgs<BlockHandlerT> ha;
ha.blocks = currBlockInfo.blockCnt;
ha.m = m;
ha.k = currBlockInfo.k;
ha.n = n;
ha.newA = newA + currBlockInfo.offsetA;
ha.B = B + currBlockInfo.offsetB;
ha.transC = C;
ha.rowsPerThread = m / m_numThreads;
ha.rowsPerBlock = rowsPerBlock;
ha.pBlockPreparedB = m_pBlockHandlerBInfo;
if (rowsPerBlock == 4)
{
#ifdef OPENMPTHREAD
#pragma omp parallel for
#endif
for (int startRow = 0; startRow < m; startRow += 4)
{
//Each iteration gets its own copy of the args so parallel iterations don't race on startRow.
HandlerArgs<BlockHandlerT> rowArgs = ha;
rowArgs.startRow = startRow;
#ifdef STDTHREAD
m_pPool->QueueAndWake(rowArgs, currBlockInfo.fourFn);
#else
#ifdef OPENMPTHREAD
currBlockInfo.fourFn(rowArgs);
#endif
#endif
}
#ifdef STDTHREAD
m_pPool->Drain();
#endif
}
else if (rowsPerBlock == 1)
{
#ifdef OPENMPTHREAD
#pragma omp parallel for
#endif
for (int startRow = 0; startRow < m; ++startRow)
{
//Per-iteration copy, as above, to avoid a data race on startRow.
HandlerArgs<BlockHandlerT> rowArgs = ha;
rowArgs.startRow = startRow;
#ifdef STDTHREAD
m_pPool->QueueAndWake(rowArgs, currBlockInfo.oneFn);
#else
#ifdef OPENMPTHREAD
currBlockInfo.oneFn(rowArgs);
#endif
#endif
}
#ifdef STDTHREAD
m_pPool->Drain();
#endif
}
else
{
throw std::runtime_error("Illegal setting for rowsPerBlock");
}
}
}
if (blocks1 > 0)
{
ScalarAT* pA = newA + offsetA1;
for (int startRow = 0; startRow < m; startRow++)
{
ScalarBT* pB = B + offsetB1;
for (int c = 0; c < n; ++c)
{
C[RowColToOffset(startRow, c, n)] += referenceKernel(pA, pB, blocks1);
pB += blocks1;
}
pA += blocks1;
}
}
FreeAlignedMatrix(newA);
}
}
}}}//end namespace