https://github.com/nigma/pywt
Raw File
Tip revision: 035e1fa14c2cd70ca270da20b1523e834a7ae635 authored by Filip Wasilewski on 29 July 2015, 18:25:19 UTC
Update readme to indicate https://github.com/PyWavelets as the new project home
Tip revision: 035e1fa
convolution.template.c
// Copyright (c) 2006-2012 Filip Wasilewski <http://en.ig.ma/>
// See COPYING for license details.

#include "convolution.h"

//## FOR $DTYPE$ IN (double, float):

int $DTYPE$_downsampling_convolution_periodization(const $DTYPE$* input, const_index_t N,
                             const $DTYPE$* filter, const_index_t F,
                             $DTYPE$* output, const_index_t step)

{

    index_t i, j, k, F_2, corr;
    index_t start;
    $DTYPE$ sum;
    $DTYPE$* ptr_w = output;

    i = step-1; // first element taken from input is input[step-1]
    //extending by (F-2)/2 elements
    start = F_2 = F/2;
    corr = 0;

    for(i=start; i < F; i+=step){
        sum = 0;
        for(j = 0; j < i+1-corr; ++j)    // overlapping
            sum += filter[j]*input[i-j-corr];

        if(N%2){
            if(F-j){ // if something to extend
                sum += filter[j] * input[N-1];
                if(F-j){
                    for(k = 2-corr; k <= F-j; ++k)
                        sum += filter[j-1+k] * input[N-k+1];
                }
            }
        } else { // extra element from input   -> i0 i1 i2 [i2]
            for(k = 1; k <= F-j; ++k)
                sum += filter[j-1+k] * input[N-k];
        }
        *(ptr_w++) = sum;
    }

    // F - N-1        - filter in input range
    // most time is spent in this loop
    for(; i < N; i+=step){                    // input elements,
        sum = 0;
        for(j = 0; j < F; ++j)
            sum += input[i-j]*filter[j];
        *(ptr_w++) = sum;
    }

    for(; i < N-step + (F/2)+1 + N%2; i += step){    // input elements
        sum = 0;
        k = i-N+1;
        for(j = k; j < F; ++j)                // overlapped elements
            sum += filter[j]*input[i-j];

        if(N%2 == 0){
            for(j = 0; j < k; ++j){            // out of boundary (filter elements [0, k-1])
                sum += filter[j]*input[k-1-j];
            }
        } else {                            // repeating extra element -> i0 i1 i2 [i2]
            for(j = 0; j < k-1; ++j)        // out of boundary (filter elements [0, k-1])
                sum += filter[j]*input[k-2-j];
            sum += filter[k-1] * input[N-1];
        }
        *(ptr_w++) = sum;
    }
    return 0;
}


int $DTYPE$_downsampling_convolution(const $DTYPE$* input, const_index_t N,
                             const $DTYPE$* filter, const_index_t F,
                             $DTYPE$* output,
                             const_index_t step, MODE mode)
{

    // This convolution performs efficient downsampling by computing every step'th
    // element of normal convolution (currently tested only for step=1 and step=2).
    //
    // It also implements several different strategies of dealing with border
    // distortion problem (the problem of computing convolution for not existing
    // elements of signal). To handle this the signal has to be "extended" on both
    // sides by computing the missing values.
    //
    // General schema is as follows:
    // 1. Handle extended on the left, convolve filter with samples computed for time < 0
    // 2. Do the normal decimated convolution of filter with signal samples
    // 3. Handle extended on the right, convolve filter with samples computed for time > n-1


    index_t i, j, k;
    index_t start;
    $DTYPE$ sum, tmp;
    #ifdef OPT_UNROLL2
        $DTYPE$ sum2;
    #endif
    #ifdef OPT_UNROLL4
        #ifndef OPT_UNROLL2
            $DTYPE$ sum2;
        #endif
        $DTYPE$ sum3, sum4;
    #endif

    $DTYPE$* ptr_w = output;

    i = start = step-1; // first element taken from input is input[step-1]

    if(F <= N){

        if(mode == MODE_PERIODIZATION){
            return $DTYPE$_downsampling_convolution_periodization(input, N, filter, F, output, step);

        // Other signal extension modes
        } else {

            ///////////////////////////////////////////////////////////////////////
            // 0 - F-1    - sliding in filter
            switch(mode)
            {
                case MODE_SYMMETRIC:
                    for(i=start; i < F; i+=step){
                        sum = 0;
                        for(j = 0; j <= i; ++j)
                            sum += filter[j]*input[i-j];
                        k = i+1;
                        for(j = i+1; j < F; ++j)
                            sum += filter[j] * input[j-k];
                        *(ptr_w++) = sum;
                    }
                    break;

                case MODE_ASYMMETRIC:
                    for(i=start; i < F; i+=step){
                        sum = 0;
                        for(j = 0; j <= i; ++j)
                            sum += filter[j]*input[i-j];
                        k = i+1;
                        for(j = i+1; j < F; ++j)
                            sum += filter[j] * (input[0] - input[j-k]); // -=
                        *(ptr_w++) = sum;
                    }
                    break;

                case MODE_CONSTANT_EDGE:
                    for(i=start; i < F; i+=step){
                        sum = 0;
                        for(j = 0; j <= i; ++j)
                            sum += filter[j]*input[i-j];

                        k = i+1;
                        for(j = i+1; j < F; ++j)
                            sum += filter[j] * input[0];

                        *(ptr_w++) = sum;
                    }
                    break;

                case MODE_SMOOTH:
                    for(i=start; i < F; i+=step){
                        sum = 0;
                        for(j = 0; j <= i; ++j)
                            sum += filter[j]*input[i-j];
                        tmp = input[0]-input[1];
                        for(j = i+1; j < F; ++j){
                            sum += filter[j] * (input[0] + tmp * (j-i));
                        }
                        *(ptr_w++) = sum;
                    }
                    break;

                case MODE_PERIODIC:
                    for(i=start; i < F; i+=step){
                        sum = 0;
                        for(j = 0; j <= i; ++j)
                            sum += filter[j]*input[i-j];

                        k = N+i;
                        for(j = i+1; j < F; ++j)
                            sum += filter[j] * input[k-j];

                        *(ptr_w++) = sum;
                    }
                    break;

                case MODE_ZEROPAD:
                default:
                    for(i=start; i < F; i+=step){
                        sum = 0;
                        for(j = 0; j <= i; ++j)
                            sum += filter[j]*input[i-j];
                        *(ptr_w++) = sum;
                    }
                    break;
            }

            ///////////////////////////////////////////////////////////////////////
            // F - N-1        - filter in input range - simple convolution
            // most time is spent in this loop
            //printf("entering loop\n");
            #ifdef OPT_UNROLL4
            // manually unroll the loop a bit
            //printf("F: %d, N: %d, i: %d\n", F, N, i);
            if((N - F)/step > 4) {
                for(; i < (N - (3*step)); i += 4*step){                    // input elements,
                    //printf("EO4 iter: %d\n", i);
                    sum  = input[i]          * filter[0];
                    sum2 = input[i+step]     * filter[0];
                    sum3 = input[i+(2*step)] * filter[0];
                    sum4 = input[i+(3*step)] * filter[0];
                    for(j = 1; j < F; ++j){
                        sum += input[i-j]           * filter[j];
                        sum2 += input[(step+i)-j]   * filter[j];
                        sum3 += input[(2*step+i)-j] * filter[j];
                        sum4 += input[(3*step+i)-j] * filter[j];
                    }
                    *(ptr_w++) = sum;
                    *(ptr_w++) = sum2;
                    *(ptr_w++) = sum3;
                    *(ptr_w++) = sum4;
                }
            }
            #endif

            #ifdef OPT_UNROLL2
            if((N - F)/step > 2) {
                for(; i < (N - step); i += 2*step){                    // input elements,
                    //printf("EO2 iter: %d\n", i);
                    sum  = input[i]   * filter[0];
                    sum2 = input[i+step] * filter[0];
                    for(j = 1; j < F; ++j){
                        sum += input[i-j]         * filter[j];
                        sum2 += input[(step+i)-j] * filter[j];
                    }
                    *(ptr_w++) = sum;
                    *(ptr_w++) = sum2;
                }
            }
            #endif

            for(; i < N; i+=step){                    // input elements,
                sum = input[i] * filter[0];
                for(j = 1; j < F; ++j){
                    //printf("EO1 iter: %d\n", i);
                    sum += input[i-j]*filter[j];
                }
                *(ptr_w++) = sum;
            }
            ///////////////////////////////////////////////////////////////////////
            // N - N+F-1    - sliding out filter
            switch(mode)
            {
                case MODE_SYMMETRIC:
                    for(; i < N+F-1; i += step){    // input elements
                        sum = 0;
                        k = i-N+1;                    // 1, 2, 3 // overlapped elements
                        for(j = k; j < F; ++j)                    //TODO: j < F-_offset
                            sum += filter[j]*input[i-j];

                        for(j = 0; j < k; ++j)        // out of boundary        //TODO: j = _offset
                            sum += filter[j]*input[N-k+j]; // j-i-1            0*(N-1), 0*(N-2) 1*(N-1)

                        *(ptr_w++) = sum;
                    }
                    break;

                case MODE_ASYMMETRIC:
                    for(; i < N+F-1; i += step){    // input elements
                        sum = 0;
                        k = i-N+1;
                        for(j = k; j < F; ++j)        // overlapped elements
                            sum += filter[j]*input[i-j];

                        for(j = 0; j < k; ++j)        // out of boundary
                            sum += filter[j]*(input[N-1]-input[N-k-1+j]); // -=    j-i-1
                        *(ptr_w++) = sum;
                    }
                    break;

                case MODE_CONSTANT_EDGE:
                    for(; i < N+F-1; i += step){    // input elements
                        sum = 0;
                        k = i-N+1;
                        for(j = k; j < F; ++j)        // overlapped elements
                            sum += filter[j]*input[i-j];

                        for(j = 0; j < k; ++j)        // out of boundary (filter elements [0, k-1])
                            sum += filter[j]*input[N-1]; // input[N-1] = const

                        *(ptr_w++) = sum;
                    }
                    break;

                case MODE_SMOOTH:
                    for(; i < N+F-1; i += step){    // input elements
                        sum = 0;
                        k = i-N+1; // 1, 2, 3, ...
                        for(j = k; j < F; ++j)        // overlapped elements
                            sum += filter[j]*input[i-j];

                        tmp = input[N-1]-input[N-2];
                        for(j = 0; j < k; ++j)        // out of boundary (filter elements [0, k-1])
                            sum += filter[j] * (input[N-1] + tmp * (k-j));
                        *(ptr_w++) = sum;
                    }
                    break;

                case MODE_PERIODIC:
                    for(; i < N+F-1; i += step){    // input elements
                        sum = 0;
                        k = i-N+1;
                        for(j = k; j < F; ++j)        // overlapped elements
                            sum += filter[j]*input[i-j];
                        for(j = 0; j < k; ++j)        // out of boundary (filter elements [0, k-1])
                            sum += filter[j]*input[k-1-j];
                        *(ptr_w++) = sum;
                    }
                    break;

                case MODE_ZEROPAD:
                default:
                    for(; i < N+F-1; i += step){
                        sum = 0;
                        for(j = i-(N-1); j < F; ++j)
                            sum += input[i-j]*filter[j];
                        *(ptr_w++) = sum;
                    }
                    break;
            }
        }
        return 0;

    } else {
        // reallocating memory for short signals (shorter than filter) is cheap
        return $DTYPE$_allocating_downsampling_convolution(input, N, filter, F, output, step, mode);
    }
}

///////////////////////////////////////////////////////////////////////////////
//
// like downsampling_convolution, but with memory allocation
//

int $DTYPE$_allocating_downsampling_convolution(const $DTYPE$* input, const_index_t N,
                                        const $DTYPE$* filter, const_index_t F,
                                        $DTYPE$* output,
                                        const_index_t step, MODE mode)
{
    index_t i, j, F_minus_1, N_extended_len, N_extended_right_start;
    index_t start, stop;
    $DTYPE$ sum, tmp;
    $DTYPE$ *buffer;
    $DTYPE$* ptr_w = output;

    F_minus_1 = F - 1;
    start = F_minus_1+step-1;

    // allocate memory and copy input
    if(mode != MODE_PERIODIZATION){

        N_extended_len = N + 2*F_minus_1;
        N_extended_right_start = N + F_minus_1;

        buffer = wtcalloc(N_extended_len, sizeof($DTYPE$));
        if(buffer == NULL)
            return -1;

        memcpy(buffer+F_minus_1, input, sizeof($DTYPE$) * N);
        stop = N_extended_len;

    } else {

        N_extended_len = N + F-1;
        N_extended_right_start = N-1 + F/2;

        buffer = wtcalloc(N_extended_len, sizeof($DTYPE$));
        if(buffer == NULL)
            return -1;

        memcpy(buffer+F/2-1, input, sizeof($DTYPE$) * N);

        start -= 1;

        if(step == 1)
            stop = N_extended_len-1;
        else // step == 2
            stop = N_extended_len;
    }

    // copy extended signal elements
    switch(mode){

        case MODE_PERIODIZATION:
            if(N%2){ // odd - repeat last element
                buffer[N_extended_right_start] = input[N-1];
                for(j = 1; j < F/2; ++j)
                    buffer[N_extended_right_start+j] = buffer[F/2-2 + j]; // copy from beginning of `input` to right
                for(j = 0; j < F/2-1; ++j)                                  // copy from 'buffer' to left
                    buffer[F/2-2-j] =  buffer[N_extended_right_start-j];
            } else {
                for(j = 0; j < F/2; ++j)
                    buffer[N_extended_right_start+j] = input[j%N]; // copy from beginning of `input` to right
                for(j = 0; j < F/2-1; ++j)                           // copy from 'buffer' to left
                    buffer[F/2-2-j] =  buffer[N_extended_right_start-1-j];
            }
            break;

        case MODE_SYMMETRIC:
            for(j = 0; j < N; ++j){
                buffer[F_minus_1-1-j] = input[j%N];
                buffer[N_extended_right_start+j] = input[N-1-(j%N)];
            }
            i=j;
            // use `buffer` as source
            for(; j < F_minus_1; ++j){
                buffer[F_minus_1-1-j] =  buffer[N_extended_right_start-1+i-j];
                buffer[N_extended_right_start+j] = buffer[F_minus_1+j-i];
            }
            break;

        case MODE_ASYMMETRIC:
            for(j = 0; j < N; ++j){
                buffer[F_minus_1-1-j] = input[0] - input[j%N];
                buffer[N_extended_right_start+j] = (input[N-1] - input[N-1-(j%N)]);
            }
            i=j;
            // use `buffer` as source
            for(; j < F_minus_1; ++j){
                buffer[F_minus_1-1-j] =  buffer[N_extended_right_start-1+i-j];
                buffer[N_extended_right_start+j] = buffer[F_minus_1+j-i];
            }
            break;

        case MODE_SMOOTH:
            if(N>1){
                tmp = input[0]-input[1];
                for(j = 0; j < F_minus_1; ++j)
                    buffer[j] = input[0] +    (tmp * (F_minus_1-j));
                tmp = input[N-1]-input[N-2];
                for(j = 0; j < F_minus_1; ++j)
                    buffer[N_extended_right_start+j] = input[N-1] + (tmp*j);
                break;
            }

        case MODE_CONSTANT_EDGE:
            for(j = 0; j < F_minus_1; ++j){
                buffer[j] = input[0];
                buffer[N_extended_right_start+j] = input[N-1];
            }
            break;

        case MODE_PERIODIC:
            for(j = 0; j < F_minus_1; ++j)
                buffer[N_extended_right_start+j] = input[j%N]; // copy from beginning of `input` to right

            for(j = 0; j < F_minus_1; ++j)                       // copy from 'buffer' to left
                buffer[F_minus_1-1-j] =  buffer[N_extended_right_start-1-j];
            break;

        case MODE_ZEROPAD:
        default:
            //memset(buffer, 0, sizeof($DTYPE$)*F_minus_1);
            //memset(buffer+N_extended_right_start, 0, sizeof($DTYPE$)*F_minus_1);
            //memcpy(buffer+N_extended_right_start, buffer, sizeof($DTYPE$)*F_minus_1);
            break;
    }


    ///////////////////////////////////////////////////////////////////////
    // F - N-1        - filter in input range
    // perform convolution with decimation
    for(i=start; i < stop; i+=step){                    // input elements
        sum = 0;
        for(j = 0; j < F; ++j){
            sum += buffer[i-j]*filter[j];
        }
        *(ptr_w++) = sum;
    }

    // free memory
    wtfree(buffer);
    return 0;
}

///////////////////////////////////////////////////////////////////////////////
// requires zero-filled output buffer
// output is larger than input
// performs "normal" convolution of "upsampled" input coeffs array with filter

int $DTYPE$_upsampling_convolution_full(const $DTYPE$* input, const_index_t N,
                                const $DTYPE$* filter, const_index_t F,
                                $DTYPE$* output, const_index_t O){
    register index_t i;
    register index_t j;
    $DTYPE$ *ptr_out;

    if(F<2)
        return -1;

    ptr_out = output + ((N-1) << 1);

    for(i = N-1; i >= 0; --i){
        // sliding in filter from the right (end of input)
        // i0 0  i1 0  i2 0
        //                f1 -> o1
        //             f1 f2 -> o2
        //          f1 f2 f3 -> o3

        for(j = 0; j < F; ++j)
            ptr_out[j] += input[i] * filter[j]; // input[i] - const in loop
        ptr_out -= 2;
    }
    return 0;
}

///////////////////////////////////////////////////////////////////////////////
// performs IDWT for PERIODIZATION mode only
// (refactored from the upsampling_convolution_valid_sf function)
//
// The upsampling is performed by splitting filters to even and odd elements
// and performing 2 convolutions
//
// The input data has to be periodically extended for this mode.

int $DTYPE$_upsampling_convolution_valid_sf_periodization(const $DTYPE$* input, const_index_t N,
                                    const $DTYPE$* filter, const_index_t F,
                                    $DTYPE$* output, const_index_t O)
{

    $DTYPE$ *ptr_out = output;
    $DTYPE$ *filter_even, *filter_odd;
    $DTYPE$ *periodization_buf = NULL;
    $DTYPE$ *periodization_buf_rear = NULL;
    $DTYPE$ *ptr_base;
    $DTYPE$ sum_even, sum_odd;
    index_t i, j, k, N_p = 0;
    index_t F_2 = F/2;

    if(F%2) return -3; // Filter must have even-length.

    ///////////////////////////////////////////////////////////////////////////
    // Handle special situation when input coeff data is shorter than half of
    // the filter's length. The coeff array has to be extended periodically.
    // This can be only valid for PERIODIZATION_MODE

    if(N < F_2)
    // =======
    {
        // Input data for periodization mode has to be periodically extended

        // New length for temporary input
        N_p = F_2-1 +N;

        // periodization_buf will hold periodically copied input coeffs values
        periodization_buf = wtcalloc(N_p, sizeof($DTYPE$));

        if(periodization_buf == NULL)
            return -1;

        // Copy input data to its place in the periodization_buf
        // -> [0 0 0 i1 i2 i3 0 0 0]
        k = (F_2-1)/2;
        for(i=k; i < k+N; ++i)
            periodization_buf[i] = input[(i-k)%N];

        //if(N%2)
        //    periodization_buf[i++] = input[N-1];

        // [0 0 0 i1 i2 i3 0 0 0]
        //  points here ^^
        periodization_buf_rear = periodization_buf+i-1;

        // copy cyclically () to right
        // [0 0 0 i1 i2 i3 i1 i2 ...]
        j = i-k;
        for(; i < N_p; ++i)
            periodization_buf[i] = periodization_buf[i-j];

        // copy cyclically () to left
        // [... i2 i3 i1 i2 i3 i1 i2 i3]
        j = 0;
        for(i=k-1; i >= 0; --i){
            periodization_buf[i] = periodization_buf_rear[j];
            --j;
        }

        // Now perform the valid convolution
        if(F_2%2){
            $DTYPE$_upsampling_convolution_valid_sf(periodization_buf, N_p, filter, F, output, O, MODE_ZEROPAD);

        // The F_2%2==0 case needs special result fix (oh my, another one..)
        } else {

            // Cheap result fix for short inputs
            // Memory allocation for temporary output is done.
            // Computed temporary result is copied to output*

            ptr_out = wtcalloc(idwt_buffer_length(N, F, MODE_PERIODIZATION), sizeof($DTYPE$));
            if(ptr_out == NULL){
                wtfree(periodization_buf);
                return -1;
            }

            // Convolve here as for (F_2%2) branch above
            $DTYPE$_upsampling_convolution_valid_sf(periodization_buf, N_p, filter, F, ptr_out, O, MODE_ZEROPAD);

            // rewrite result to output
            for(i=2*N-1; i > 0; --i){
                output[i] += ptr_out[i-1];
            }
            // and the first element
            output[0] += ptr_out[2*N-1];
            wtfree(ptr_out);
            // and voil`a!, ugh
        }

    } else {
    // Otherwise (N >= F_2)

        // Allocate memory for even and odd elements of the filter
        filter_even = wtmalloc(F_2 * sizeof($DTYPE$));
        filter_odd = wtmalloc(F_2 * sizeof($DTYPE$));

        if(filter_odd == NULL || filter_odd == NULL){
            if(filter_odd == NULL) wtfree(filter_odd);
            if(filter_even == NULL) wtfree(filter_even);
            return -1;
        }

        // split filter to even and odd values
        for(i = 0; i < F_2; ++i){
            filter_even[i] = filter[i << 1];
            filter_odd[i] = filter[(i << 1) + 1];
        }

        ///////////////////////////////////////////////////////////////////////////
        // This part is quite complicated and has some wild checking to get results
        // similar to those from Matlab(TM) Wavelet Toolbox

        k = F_2-1;

        // Check if extending is really needed
        N_p = F_2-1 + (index_t) ceil(k/2.); /*split filter len correct. +  extra samples*/

        // ok, if is then do:
        // 1. Allocate buffers for front and rear parts of extended input
        // 2. Copy periodically appropriate elements from input to the buffers
        // 3. Convolve front buffer, input and rear buffer with even and odd
        //    elements of the filter (this results in upsampling)
        // 4. Free memory

        if(N_p > 0){
        // =======

            // Allocate memory only for the front and rear extension parts, not the
            // whole input
            periodization_buf = wtcalloc(N_p, sizeof($DTYPE$));
            periodization_buf_rear = wtcalloc(N_p, sizeof($DTYPE$));

            // Memory checking
            if(periodization_buf == NULL || periodization_buf_rear == NULL){
                if(periodization_buf == NULL) wtfree(periodization_buf);
                if(periodization_buf_rear == NULL) wtfree(periodization_buf_rear);
                wtfree(filter_odd);
                wtfree(filter_even);
                return -1;
            }

            // Fill buffers with appropriate elements
            memcpy(periodization_buf + N_p - k, input, k * sizeof($DTYPE$));        // copy from beginning of input to end of buffer
            for(i = 1; i <= (N_p - k); ++i)                                        // kopiowanie 'cykliczne' od końca input
                periodization_buf[(N_p - k) - i] = input[N - (i%N)];

            memcpy(periodization_buf_rear, input + N - k, k * sizeof($DTYPE$));    // copy from end of input to beginning of buffer
            for(i = 0; i < (N_p - k); ++i)                                        // kopiowanie 'cykliczne' od początku input
                periodization_buf_rear[k + i] = input[i%N];

            ///////////////////////////////////////////////////////////////////
            // Convolve filters with the (front) periodization_buf and compute
            // the first part of output

            ptr_base = periodization_buf + F_2 - 1;

            if(k%2 == 1){
                sum_odd = 0;

                for(j = 0; j < F_2; ++j)
                    sum_odd += filter_odd[j] * ptr_base[-j];
                *(ptr_out++) += sum_odd;

                --k;
                if(k)
                    $DTYPE$_upsampling_convolution_valid_sf(periodization_buf + 1, N_p-1, filter, F, ptr_out, O-1, MODE_ZEROPAD);

                ptr_out += k; // k0 - 1 // really move backward by 1

            } else if(k){
                $DTYPE$_upsampling_convolution_valid_sf(periodization_buf, N_p, filter, F, ptr_out, O, MODE_ZEROPAD);
                ptr_out += k;
            }
        }

        ///////////////////////////////////////////////////////////////////////////
        // Perform _valid_ convolution (only when all filter_even and filter_odd elements
        // are in range of input data).
        //
        // This part is simple, no extra hacks, just two convolutions in one loop

        ptr_base = ($DTYPE$*)input + F_2 - 1;
        for(i = 0; i < N-(F_2-1); ++i){    // sliding over signal from left to right

            sum_even = 0;
            sum_odd = 0;

            for(j = 0; j < F_2; ++j){
                sum_even += filter_even[j] * ptr_base[i-j];
                sum_odd += filter_odd[j] * ptr_base[i-j];
            }

            *(ptr_out++) += sum_even;
            *(ptr_out++) += sum_odd;
        }
        //
        ///////////////////////////////////////////////////////////////////////////

        if(N_p > 0){
        // =======

            k = F_2-1;
            if(k%2 == 1){
                if(F/2 <= N_p - 1){ // k > 1 ?
                    $DTYPE$_upsampling_convolution_valid_sf(periodization_buf_rear , N_p-1, filter, F, ptr_out, O-1, MODE_ZEROPAD);
                }

                ptr_out += k; // move forward anyway -> see lower

                if(F_2%2 == 0){ // remaining one element
                    ptr_base = periodization_buf_rear + N_p - 1;

                    sum_even = 0;
                    for(j = 0; j < F_2; ++j){
                        sum_even += filter_even[j] * ptr_base[-j];
                    }
                    *(--ptr_out) += sum_even; // move backward first
                }
            } else {
                if(k){
                    $DTYPE$_upsampling_convolution_valid_sf(periodization_buf_rear, N_p, filter, F, ptr_out, O, MODE_ZEROPAD);
                }
            }
        }
        if(periodization_buf != NULL) wtfree(periodization_buf);
        if(periodization_buf_rear != NULL) wtfree(periodization_buf_rear);

        wtfree(filter_even);
        wtfree(filter_odd);

    }
    return 0;
}


///////////////////////////////////////////////////////////////////////////////
// performs IDWT for all modes
//
// The upsampling is performed by splitting filters to even and odd elements
// and performing 2 convolutions.
// After refactoring the PERIODIZATION mode case to separate function this
// looks much clearer now.

int $DTYPE$_upsampling_convolution_valid_sf(const $DTYPE$* input, const_index_t N,
                                    const $DTYPE$* filter, const_index_t F,
                                    $DTYPE$* output, const_index_t O,
                                    MODE mode){

    $DTYPE$ *ptr_out = output;
    $DTYPE$ *filter_even, *filter_odd;
    $DTYPE$ *ptr_base;
    $DTYPE$ sum_even, sum_odd;
    #ifdef OPT_UNROLL2
        $DTYPE$ sum_even2, sum_odd2;
    #endif
    #ifdef OPT_UNROLL4
        #ifndef OPT_UNROLL2
            $DTYPE$ sum_even2, sum_odd2;
        #endif
        $DTYPE$ sum_even3, sum_odd3;
        $DTYPE$ sum_even4, sum_odd4;
    #endif

    index_t i, j;
    index_t F_2 = F/2;

    if(mode == MODE_PERIODIZATION) // Special case
        return $DTYPE$_upsampling_convolution_valid_sf_periodization(input, N, filter, F, output, O);

    if((F%2) || (N < F_2)) // Filter must have even length.
        return -1;

    // Allocate memory for even and odd elements of the filter
    filter_even = wtmalloc(F_2 * sizeof($DTYPE$));
    filter_odd = wtmalloc(F_2 * sizeof($DTYPE$));

    if(filter_odd == NULL || filter_odd == NULL){
        if(filter_odd == NULL) wtfree(filter_odd);
        if(filter_even == NULL) wtfree(filter_even);
        return -1;
    }

    // split filter to even and odd values
    for(i = 0; i < F_2; ++i){
        filter_even[i] = filter[i << 1];
        filter_odd[i] = filter[(i << 1) + 1];
    }

    ///////////////////////////////////////////////////////////////////////////
    // Perform _valid_ convolution (only when all filter_even and filter_odd elements
    // are in range of input data).
    //
    // This part is simple, no extra hacks, just two convolutions in one loop

    ptr_base = ($DTYPE$*)input + F_2 - 1;
    i = 0;

    #ifdef OPT_UNROLL4
    // manually unroll the loop a bit
    for(; i < N-(F_2-1+8); i+=4){    // sliding over signal from left to right
        sum_even  = filter_even[0] * ptr_base[i];
        sum_even2 = filter_even[0] * ptr_base[i+1];
        sum_even3 = filter_even[0] * ptr_base[i+2];
        sum_even4 = filter_even[0] * ptr_base[i+3];

        sum_odd   = filter_odd[0] * ptr_base[i];
        sum_odd2  = filter_odd[0] * ptr_base[i+1];
        sum_odd3  = filter_odd[0] * ptr_base[i+2];
        sum_odd4  = filter_odd[0] * ptr_base[i+3];

        for(j = 1; j < F_2; ++j){
            sum_even  += filter_even[j] * ptr_base[i-j];
            sum_even2 += filter_even[j] * ptr_base[(i+1)-j];
            sum_even3 += filter_even[j] * ptr_base[(i+2)-j];
            sum_even4 += filter_even[j] * ptr_base[(i+3)-j];

            sum_odd  += filter_odd[j] * ptr_base[i-j];
            sum_odd2 += filter_odd[j] * ptr_base[(i+1)-j];
            sum_odd3 += filter_odd[j] * ptr_base[(i+2)-j];
            sum_odd4 += filter_odd[j] * ptr_base[(i+3)-j];
        }

        *(ptr_out++) += sum_even;
        *(ptr_out++) += sum_odd;
        *(ptr_out++) += sum_even2;
        *(ptr_out++) += sum_odd2;
        *(ptr_out++) += sum_even3;
        *(ptr_out++) += sum_odd3;
        *(ptr_out++) += sum_even4;
        *(ptr_out++) += sum_odd4;
    }
    #endif

    #ifdef OPT_UNROLL2
    // manually unroll the loop a bit
    for(; i < N-(F_2+1); i+=2){    // sliding over signal from left to right
        sum_even = filter_even[0] * ptr_base[i];
        sum_even2 = filter_even[0] * ptr_base[i+1];

        sum_odd  = filter_odd[0] * ptr_base[i];
        sum_odd2  = filter_odd[0] * ptr_base[i+1];

        for(j = 1; j < F_2; ++j){
            sum_even += filter_even[j] * ptr_base[i-j];
            sum_even2 += filter_even[j] * ptr_base[(i+1)-j];

            sum_odd += filter_odd[j] * ptr_base[i-j];
            sum_odd2 += filter_odd[j] * ptr_base[(i+1)-j];
        }

        *(ptr_out++) += sum_even;
        *(ptr_out++) += sum_odd;
        *(ptr_out++) += sum_even2;
        *(ptr_out++) += sum_odd2;
    }
    #endif

    for(; i < N-(F_2-1); ++i){    // sliding over signal from left to right
        sum_even = filter_even[0] * ptr_base[i];
        sum_odd  = filter_odd[0] * ptr_base[i];

        for(j = 1; j < F_2; ++j){
            sum_even += filter_even[j] * ptr_base[i-j];
            sum_odd += filter_odd[j] * ptr_base[i-j];
        }

        *(ptr_out++) += sum_even;
        *(ptr_out++) += sum_odd;
    }

    //
    ///////////////////////////////////////////////////////////////////////////

    wtfree(filter_even);
    wtfree(filter_odd);
    return 0;
}

// -> swt - todo
int $DTYPE$_upsampled_filter_convolution(const $DTYPE$* input, const_index_t N, const $DTYPE$* filter, const_index_t F, $DTYPE$* output, const_index_t step, MODE mode)
{
    return -1;
}

//## ENDFOR $DTYPE$
back to top