// xts.cpp - written and placed in the public domain by Jeffrey Walton

// Aarch32, Aarch64, Altivec and X86_64 include SIMD as part of the
// base architecture. We can use the SIMD code below without an
// architecture option. No runtime tests are required. Unfortunately,
// we can't use it on Altivec because an architecture switch is required.
// The updated XorBuffer gains 0.3 to 1.5 cpb on the architectures for
// 16-byte block sizes.

#include "pch.h"

#include "xts.h"
#include "misc.h"
#include "modes.h"
#include "cpu.h"

// Headers needed only by the debug-time template instantiation test.
#if defined(CRYPTOPP_DEBUG)
# include "aes.h"
# include "threefish.h"
#endif

// SSE2 intrinsics for XorBuffer. 0.3 to 0.4 cpb profit
#if defined(__SSE2__) || defined(_M_X64)
# include <emmintrin.h>
#endif

// NEON intrinsics for XorBuffer.
#if defined(__aarch32__) || defined(__aarch64__) || defined(_M_ARM64)
# include <arm_neon.h>
#endif

// Altivec helpers (VecLoad, VecXor, VecStore) for XorBuffer.
#if defined(__ALTIVEC__)
# include "ppc_simd.h"
#endif


using namespace CryptoPP;

// AES and Threefish512 are declared by headers that this file includes only
// when CRYPTOPP_DEBUG is defined, so the instantiation test carries the
// same guard.
#if defined(CRYPTOPP_DEBUG)

using CryptoPP::AES;
using CryptoPP::XTS_Mode;
using CryptoPP::Threefish512;

// Declares one object of each XTS encryption/decryption type so the
// templates are instantiated by the compiler. The objects are never used.
void Modes_TestInstantiations()
{
    XTS_Mode<AES>::Encryption m0;
    XTS_Mode<AES>::Decryption m1;
    XTS_Mode<AES>::Encryption m2;
    XTS_Mode<AES>::Decryption m3;

    // NOTE(review): Threefish512 is presumably a wide (64-byte) block
    // cipher; confirm these should be built when wide blocks are disabled.
    XTS_Mode<Threefish512>::Encryption m4;
    XTS_Mode<Threefish512>::Decryption m5;
}

#endif  // CRYPTOPP_DEBUG

// XORs count bytes of input with mask and writes the result to output.
//   output - destination buffer, count bytes
//   input  - first operand, count bytes
//   mask   - second operand, count bytes
//   count  - byte count; asserted to be a non-zero multiple of 16
// The SIMD paths use unaligned loads and stores and step 16 bytes at a time.
inline void XorBuffer(byte *output, const byte *input, const byte *mask, size_t count)
{
    CRYPTOPP_ASSERT(count >= 16 && (count % 16 == 0));

    // NOTE(review): the opening condition of this chain was missing from the
    // file. CRYPTOPP_DISABLE_ASM is the library's switch for portable code
    // paths - confirm against the project configuration.
#if defined(CRYPTOPP_DISABLE_ASM)
    xorbuf(output, input, mask, count);

#elif defined(__SSE2__) || defined(_M_X64)
    for (size_t i=0; i<count; i+=16)
    {
        const __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i*>(input+i));
        const __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i*>(mask+i));
        _mm_storeu_si128(reinterpret_cast<__m128i*>(output+i), _mm_xor_si128(a, b));
    }

#elif defined(__aarch32__) || defined(__aarch64__) || defined(_M_ARM64)
    for (size_t i=0; i<count; i+=16)
        vst1q_u8(output+i, veorq_u8(vld1q_u8(input+i), vld1q_u8(mask+i)));

#elif defined(__ALTIVEC__)
    for (size_t i=0; i<count; i+=16)
        VecStore(VecXor(VecLoad(input+i), VecLoad(mask+i)), output+i);

#else
    // No SIMD available; use the generic byte-wise helper.
    xorbuf(output, input, mask, count);
#endif
}

// In-place variant: XORs count bytes of mask into buf.
// count carries the same requirement as the four-argument overload.
inline void XorBuffer(byte *buf, const byte *mask, size_t count)
{
    XorBuffer(buf, buf, mask, count);
}

// Borrowed from CMAC, but little-endian representation.
// Shifts the len-byte little-endian value at 'in' left by one bit, writes it
// to 'out', and when a bit is shifted out of the top folds it back in by
// XORing the reduction constant for that block size into the low bytes.
//   out - destination, len bytes (may alias in)
//   in  - source, len bytes
//   len - block size in bytes; 16, or 16/32/64/128 with wide block support
inline void GF_Double(byte *out, const byte* in, unsigned int len)
{
    // The shift is done in the widest word available. 'carry' is the bit
    // shifted out of the previous word; the low bit of (x << 1) is always
    // zero, so adding carry is the same as OR-ing it in.
    // NOTE(review): the opening condition of this chain was missing from the
    // file; CRYPTOPP_WORD128_AVAILABLE is the library's word128 feature
    // macro - confirm.
#if defined(CRYPTOPP_WORD128_AVAILABLE)
    word128 carry = 0, x;
    for (size_t i=0, idx=0; i<len/16; ++i, idx+=16)
    {
        x = GetWord<word128>(false, LITTLE_ENDIAN_ORDER, in+idx);
        word128 y = (x >> 127); x = (x << 1) + carry;
        PutWord<word128>(false, LITTLE_ENDIAN_ORDER, out+idx, x);
        carry = y;
    }
#elif defined(_M_X64) || defined(_M_ARM64) || defined(_LP64) || defined(__LP64__)
    word64 carry = 0, x;
    for (size_t i=0, idx=0; i<len/8; ++i, idx+=8)
    {
        x = GetWord<word64>(false, LITTLE_ENDIAN_ORDER, in+idx);
        word64 y = (x >> 63); x = (x << 1) + carry;
        PutWord<word64>(false, LITTLE_ENDIAN_ORDER, out+idx, x);
        carry = y;
    }
#else
    word32 carry = 0, x;
    for (size_t i=0, idx=0; i<len/4; ++i, idx+=4)
    {
        x = GetWord<word32>(false, LITTLE_ENDIAN_ORDER, in+idx);
        word32 y = (x >> 31); x = (x << 1) + carry;
        PutWord<word32>(false, LITTLE_ENDIAN_ORDER, out+idx, x);
        carry = y;
    }
#endif

    // NOTE(review): the condition separating the two reduction variants was
    // missing from the file; CRYPTOPP_XTS_WIDE_BLOCK_CIPHERS is the macro the
    // library's xts.h uses for this purpose - confirm. An undefined macro
    // evaluates to 0 and selects the 16-byte-only variant.
#if CRYPTOPP_XTS_WIDE_BLOCK_CIPHERS

    CRYPTOPP_ASSERT(len >= 16);
    CRYPTOPP_ASSERT(len <= 128);

    byte* k = out;
    if (carry)
    {
        // Each case has its own scope (LEIDX is declared per case) and ends
        // with break so only one polynomial is applied.
        switch (len)
        {
        case 16:
        {
            // Polynomial x^128 + x^7 + x^2 + x + 1
            const size_t LEIDX = 16-1;
            k[LEIDX-15] ^= 0x87;
            break;
        }
        case 32:
        {
            // Polynomial x^256 + x^10 + x^5 + x^2 + 1
            const size_t LEIDX = 32-1;
            k[LEIDX-30] ^= 4;
            k[LEIDX-31] ^= 0x25;
            break;
        }
        case 64:
        {
            // Polynomial x^512 + x^8 + x^5 + x^2 + 1
            const size_t LEIDX = 64-1;
            k[LEIDX-62] ^= 1;
            k[LEIDX-63] ^= 0x25;
            break;
        }
        case 128:
        {
            // Polynomial x^1024 + x^19 + x^6 + x + 1
            const size_t LEIDX = 128-1;
            k[LEIDX-125] ^= 8;
            k[LEIDX-126] ^= 0x00;
            k[LEIDX-127] ^= 0x43;
            break;
        }
        default:
            // Unsupported block size; callers validate the size beforehand.
            CRYPTOPP_ASSERT(0);
        }
    }
#else
    CRYPTOPP_ASSERT(len == 16);

    // Polynomial x^128 + x^7 + x^2 + x + 1
    byte* k = out;
    if (carry)
        k[0] ^= 0x87;
#endif  // CRYPTOPP_XTS_WIDE_BLOCK_CIPHERS
}

// In-place variant: doubles the len-byte value at inout.
inline void GF_Double(byte *inout, unsigned int len)
{
    GF_Double(inout, inout, len);
}



void XTS_ModeBase::ThrowIfInvalidBlockSize(size_t length)
    CRYPTOPP_ASSERT(length >= 16 && length <= 128 && IsPowerOf2(length));
    if (length < 16 || length > 128 || !IsPowerOf2(length))
        throw InvalidArgument(AlgorithmName() + ": block size of underlying block cipher is not valid");
    CRYPTOPP_ASSERT(length == 16);
    if (length != 16)
        throw InvalidArgument(AlgorithmName() + ": block size of underlying block cipher is not 16");

void XTS_ModeBase::ThrowIfInvalidKeyLength(size_t length)
    CRYPTOPP_ASSERT(length % 2 == 0);
    if (!GetBlockCipher().IsValidKeyLength((length+1)/2))
        throw InvalidKeyLength(AlgorithmName(), length);

void XTS_ModeBase::SetKey(const byte *key, size_t length, const NameValuePairs &params)

    const size_t klen = length/2;
    AccessBlockCipher().SetKey(key+0, klen, params);
    AccessTweakCipher().SetKey(key+klen, klen, params);


    size_t ivLength;
    const byte *iv = GetIVAndThrowIfInvalid(params, ivLength);
    Resynchronize(iv, (int)ivLength);

void XTS_ModeBase::Resynchronize(const byte *iv, int ivLength)
    BlockOrientedCipherModeBase::Resynchronize(iv, ivLength);
    std::memcpy(m_xregister, m_register, ivLength);

// Resets the mode using a 64-bit sector number as the IV.
//   sector - sector number, written into the first 8 bytes of the IV
//   order  - byte order used to encode the sector number
// The rest of the IV, sized to the tweak cipher's block size, is zeroed.
void XTS_ModeBase::Resynchronize(word64 sector, ByteOrder order)
{
    SecByteBlock iv(GetTweakCipher().BlockSize());
    PutWord<word64>(false, order, iv, sector);
    std::memset(iv+8, 0x00, iv.size()-8);

    BlockOrientedCipherModeBase::Resynchronize(iv, static_cast<int>(iv.size()));
    std::memcpy(m_xregister, iv, iv.size());
}

void XTS_ModeBase::ResizeBuffers()

// ProcessData runs either 12-4-1 blocks, 8-2-1 or 4-1 blocks. Which is
// selected depends on ParallelBlocks in the header file. 12-4-1 or 8-2-1
// can be used on Aarch64 and PowerPC. Intel should use 4-1 due to lack
// of registers. The unneeded code paths should be removed by optimizer.
// The extra gyrations save us 1.8 cpb on Aarch64 and 2.1 cpb on PowerPC.
//
//   outString - destination, length bytes
//   inString  - source, length bytes
//   length    - byte count; asserted to be a multiple of the block size
//
// For each block the current tweak is XORed in, the block is run through
// the block cipher, and the tweak is XORed in again. The tweak for block
// i+1 is the tweak for block i doubled with GF_Double. On return
// m_xregister[0..blockSize) holds the tweak for the next block.
void XTS_ModeBase::ProcessData(byte *outString, const byte *inString, size_t length)
{
    // data unit is multiple of 16 bytes
    CRYPTOPP_ASSERT(length % BlockSize() == 0);

    enum { lastParallelBlock = ParallelBlocks-1 };
    const unsigned int blockSize = GetBlockCipher().BlockSize();
    const size_t parallelSize = blockSize*ParallelBlocks;

    // encrypt the data unit, optimal size at a time
    while (length >= parallelSize)
    {
        // m_xregister[0] always points to the next tweak.
        GF_Double(m_xregister+1*blockSize, m_xregister+0*blockSize, blockSize);
        GF_Double(m_xregister+2*blockSize, m_xregister+1*blockSize, blockSize);
        GF_Double(m_xregister+3*blockSize, m_xregister+2*blockSize, blockSize);

        if (ParallelBlocks > 4)
        {
            GF_Double(m_xregister+4*blockSize, m_xregister+3*blockSize, blockSize);
            GF_Double(m_xregister+5*blockSize, m_xregister+4*blockSize, blockSize);
            GF_Double(m_xregister+6*blockSize, m_xregister+5*blockSize, blockSize);
            GF_Double(m_xregister+7*blockSize, m_xregister+6*blockSize, blockSize);
        }
        if (ParallelBlocks > 8)
        {
            GF_Double(m_xregister+8*blockSize, m_xregister+7*blockSize, blockSize);
            GF_Double(m_xregister+9*blockSize, m_xregister+8*blockSize, blockSize);
            GF_Double(m_xregister+10*blockSize, m_xregister+9*blockSize, blockSize);
            GF_Double(m_xregister+11*blockSize, m_xregister+10*blockSize, blockSize);
        }

        // merge the tweak into the input block
        XorBuffer(m_xworkspace, inString, m_xregister, parallelSize);

        // encrypt the blocks, merge the tweak into the output block
        GetBlockCipher().AdvancedProcessBlocks(m_xworkspace, m_xregister,
            outString, parallelSize, BlockTransformation::BT_AllowParallel);

        // m_xregister[0] always points to the next tweak.
        GF_Double(m_xregister+0, m_xregister+lastParallelBlock*blockSize, blockSize);

        inString += parallelSize;
        outString += parallelSize;
        length -= parallelSize;
    }

    // encrypt the data unit, 4 blocks at a time
    while (ParallelBlocks == 12 && length >= blockSize*4)
    {
        // m_xregister[0] always points to the next tweak.
        GF_Double(m_xregister+1*blockSize, m_xregister+0*blockSize, blockSize);
        GF_Double(m_xregister+2*blockSize, m_xregister+1*blockSize, blockSize);
        GF_Double(m_xregister+3*blockSize, m_xregister+2*blockSize, blockSize);

        // merge the tweak into the input block
        XorBuffer(m_xworkspace, inString, m_xregister, blockSize*4);

        // encrypt the blocks, merge the tweak into the output block
        GetBlockCipher().AdvancedProcessBlocks(m_xworkspace, m_xregister,
            outString, blockSize*4, BlockTransformation::BT_AllowParallel);

        // m_xregister[0] always points to the next tweak.
        GF_Double(m_xregister+0, m_xregister+3*blockSize, blockSize);

        inString += blockSize*4;
        outString += blockSize*4;
        length -= blockSize*4;
    }

    // encrypt the data unit, 2 blocks at a time
    while (ParallelBlocks == 8 && length >= blockSize*2)
    {
        // m_xregister[0] always points to the next tweak.
        GF_Double(m_xregister+1*blockSize, m_xregister+0*blockSize, blockSize);

        // merge the tweak into the input block
        XorBuffer(m_xworkspace, inString, m_xregister, blockSize*2);

        // encrypt the blocks, merge the tweak into the output block
        GetBlockCipher().AdvancedProcessBlocks(m_xworkspace, m_xregister,
            outString, blockSize*2, BlockTransformation::BT_AllowParallel);

        // m_xregister[0] always points to the next tweak.
        GF_Double(m_xregister+0, m_xregister+1*blockSize, blockSize);

        inString += blockSize*2;
        outString += blockSize*2;
        length -= blockSize*2;
    }

    // encrypt the data unit, blocksize at a time. The bound is a full block
    // rather than non-zero so that a length violating the assert above
    // cannot wrap the unsigned counter and run past the buffers.
    while (length >= blockSize)
    {
        // merge the tweak into the input block
        XorBuffer(m_xworkspace, inString, m_xregister, blockSize);

        // encrypt one block
        // NOTE(review): this call was missing from the file; restored to
        // match the comment - confirm.
        GetBlockCipher().ProcessBlock(m_xworkspace);

        // merge the tweak into the output block
        XorBuffer(outString, m_xworkspace, m_xregister, blockSize);

        // Multiply T by alpha
        GF_Double(m_xregister, blockSize);

        inString += blockSize;
        outString += blockSize;
        length -= blockSize;
    }
}

// Processes the tail of a message, dispatching on the cipher direction.
//   outString - destination buffer
//   outLength - size of the destination buffer in bytes
//   inString  - source buffer
//   inLength  - number of source bytes; must be at least one block
// Returns the value returned by the direction-specific handler.
// Throws InvalidArgument when inLength is shorter than one block.
size_t XTS_ModeBase::ProcessLastBlock(byte *outString, size_t outLength, const byte *inString, size_t inLength)
{
    // need at least a full AES block
    CRYPTOPP_ASSERT(inLength >= BlockSize());

    if (inLength < BlockSize())
        throw InvalidArgument("XTS: message is too short for ciphertext stealing");

    if (IsForwardTransformation())
        return ProcessLastPlainBlock(outString, outLength, inString, inLength);
    else
        return ProcessLastCipherBlock(outString, outLength, inString, inLength);
}

// Encrypts the tail of a message, using ciphertext stealing when the input
// is not a whole number of blocks.
//   outString - destination buffer
//   outLength - size of the destination buffer; asserted >= inLength
//   inString  - source plaintext
//   inLength  - number of plaintext bytes
// Returns inLength, the number of bytes written.
size_t XTS_ModeBase::ProcessLastPlainBlock(byte *outString, size_t outLength, const byte *inString, size_t inLength)
{
    // ensure output buffer is large enough
    CRYPTOPP_ASSERT(outLength >= inLength);

    const unsigned int blockSize = GetBlockCipher().BlockSize();
    const size_t blocks = inLength / blockSize;
    const size_t tail = inLength % blockSize;
    outLength = inLength;

    if (tail == 0)
    {
        // Allow ProcessData to handle all the full blocks
        ProcessData(outString, inString, inLength);
        return inLength;
    }
    else if (blocks > 1)
    {
        // Allow ProcessData to handle full blocks except one. Exactly 'head'
        // bytes are consumed here; the pointers then advance by the same
        // amount so the last full block and the partial block remain.
        const size_t head = (blocks-1)*blockSize;
        ProcessData(outString, inString, head);

        outString += head;
        inString  += head; inLength -= head;
    }

    ///// handle the full block /////

    // merge the tweak into the input block
    XorBuffer(m_xworkspace, inString, m_xregister, blockSize);

    // encrypt one block
    // NOTE(review): this call was missing from the file; restored to match
    // the comment - confirm.
    GetBlockCipher().ProcessBlock(m_xworkspace);

    // merge the tweak into the output block
    XorBuffer(outString, m_xworkspace, m_xregister, blockSize);

    // Multiply T by alpha
    GF_Double(m_xregister, blockSize);

    ///// handle final partial block /////

    inString += blockSize;
    outString += blockSize;
    const size_t len = inLength-blockSize;

    // copy in the final plaintext bytes
    std::memcpy(m_xworkspace, inString, len);
    // and copy out the final ciphertext bytes
    std::memcpy(outString, outString-blockSize, len);
    // "steal" ciphertext to complete the block
    std::memcpy(m_xworkspace+len, outString-blockSize+len, blockSize-len);

    // merge the tweak into the input block
    XorBuffer(m_xworkspace, m_xregister, blockSize);

    // encrypt one block
    // NOTE(review): restored call, as above - confirm.
    GetBlockCipher().ProcessBlock(m_xworkspace);

    // merge the tweak into the previous output block
    XorBuffer(outString-blockSize, m_xworkspace, m_xregister, blockSize);

    return outLength;
}

// Decrypts the tail of a message, undoing ciphertext stealing when the
// input is not a whole number of blocks.
//   outString - destination buffer
//   outLength - size of the destination buffer; asserted >= inLength
//   inString  - source ciphertext
//   inLength  - number of ciphertext bytes
// Returns inLength, the number of bytes written.
size_t XTS_ModeBase::ProcessLastCipherBlock(byte *outString, size_t outLength, const byte *inString, size_t inLength)
{
    // ensure output buffer is large enough
    CRYPTOPP_ASSERT(outLength >= inLength);

    const unsigned int blockSize = GetBlockCipher().BlockSize();
    const size_t blocks = inLength / blockSize;
    const size_t tail = inLength % blockSize;
    outLength = inLength;

    if (tail == 0)
    {
        // Allow ProcessData to handle all the full blocks
        ProcessData(outString, inString, inLength);
        return inLength;
    }
    else if (blocks > 1)
    {
        // Allow ProcessData to handle full blocks except one. Exactly 'head'
        // bytes are consumed here; the pointers then advance by the same
        // amount so the last full block and the partial block remain.
        const size_t head = (blocks-1)*blockSize;
        ProcessData(outString, inString, head);

        outString += head;
        inString  += head; inLength -= head;
    }

    // Two tweaks are needed, applied in reverse order: poly2 (the doubled
    // tweak) for the last full ciphertext block, then poly1 (the current
    // tweak) for the reassembled block. Locals are used instead of macros so
    // the names do not leak into the rest of the translation unit.
    const byte* const poly1 = m_xregister+0*blockSize;
    byte* const poly2 = m_xregister+1*blockSize;
    GF_Double(poly2, poly1, blockSize);

    ///// handle final partial block /////

    inString += blockSize;
    outString += blockSize;
    const size_t len = inLength-blockSize;

    // merge the tweak into the input block
    XorBuffer(m_xworkspace, inString-blockSize, poly2, blockSize);

    // run one block through the cipher
    // NOTE(review): this call was missing from the file; restored to match
    // the original "encrypt one block" comment - confirm.
    GetBlockCipher().ProcessBlock(m_xworkspace);

    // merge the tweak into the output block
    XorBuffer(m_xworkspace, poly2, blockSize);

    // copy the final partial input bytes into the previous output block
    std::memcpy(outString-blockSize, inString, len);
    // and copy out the final recovered bytes
    std::memcpy(outString, m_xworkspace, len);
    // return the "stolen" bytes to complete the previous block
    std::memcpy(outString-blockSize+len, m_xworkspace+len, blockSize-len);

    ///// handle the full previous block /////

    inString -= blockSize;
    outString -= blockSize;

    // merge the tweak into the input block
    XorBuffer(m_xworkspace, outString, poly1, blockSize);

    // run one block through the cipher
    // NOTE(review): restored call, as above - confirm.
    GetBlockCipher().ProcessBlock(m_xworkspace);

    // merge the tweak into the output block
    XorBuffer(outString, m_xworkspace, poly1, blockSize);

    return outLength;
}

// End of xts.cpp