Revision cc4814cbc080b465aebd3c9d2b1c7c41d851d7e2 authored by Clément Pernet on 14 October 2022, 15:01:33 UTC, committed by Clément Pernet on 14 October 2022, 15:01:33 UTC
1 parent 462d35d
ffpack_sss.inl
/* ffpack/ffpack_sss.inl
* Copyright (C) 2021 Hippolyte Signargout
*
* Written by Hippolyte Signargout <hippolyte.signargout@ens-lyon.fr>
*
*
* ========LICENCE========
* This file is part of the library FFLAS-FFPACK.
*
* FFLAS-FFPACK is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
* ========LICENCE========
*.
*/
#ifndef __FFLASFFPACK_ffpack_sss_inl
#define __FFLASFFPACK_ffpack_sss_inl
namespace FFPACK{
template<class Field>
inline void productSSSxTS (const Field& Fi, size_t N, size_t t, size_t s,
const typename Field::Element alpha,
typename Field::ConstElement_ptr P, size_t ldp,
typename Field::ConstElement_ptr Q, size_t ldq,
typename Field::ConstElement_ptr R, size_t ldr,
typename Field::ConstElement_ptr U, size_t ldu,
typename Field::ConstElement_ptr V, size_t ldv,
typename Field::ConstElement_ptr W, size_t ldw,
typename Field::ConstElement_ptr D, size_t ldd,
typename Field::ConstElement_ptr B, size_t ldb,
const typename Field::Element beta,
typename Field::Element_ptr C, size_t ldc)
{
/* +--------+------+------+--------+--- +--+ +--+
* | D1 | U1V2 |U1W2V3|U1W2W3V4| |B1| |C1|
* +--------+------+------+--------+--- +--+ +--+
* | P2Q1 | D2 | U2V3 | U2W3V4 | |B2| |C2|
* alpha +--------+------+------+--------+--- +--+ + beta +--+
* | P3R2Q1 | P3Q2 | D3 | U3V4 | |B3| |C3|
* +--------+------+------+--------+--- +--+ +--+
* |P4R3R2Q1|P4R3Q2| P4Q3 | D4 | |B4| |C4|
* +--------+------+------+--------+--- +--+ +--+
* | | | | | | | | | */
/* Block division */
size_t kf = N/s; // Nb of full slices of dimension s
size_t rs = N%s; //size of the partial block
size_t k = rs ? kf+1 : kf; // Total number of blocks
size_t ls = (rs)? rs: s; // Size of the last block:
/* Correspondence between matrix and table indices
* First block Last block
* D -> D_1, D + ((n - ls) * ldd) -> D_k
* P -> P_2, P + ((n - s - ls) * ldp) -> P_k
* Q -> Q_1, Q + ((n - s - ls) * ldq) -> Q_{k-1}
* R -> R_2, R + ((n - 2s - ls) * ldr) -> R_{k-1}
* U -> U_1, U + ((n - s - ls) * ldu) -> U_{k-1}
* V -> V_2, V + ((n - s - ls) * ldv) -> V_k
* W -> W_2, W + ((n - 2s - ls) * ldw) -> W_{k-1}
*/
/* Unused space when s does not divide n:
*
* | | | |
* +---+---+ +---+---+ Code readability is preferred to efficiency:
* | D | * | | | | One could store D_last and V_last in new variables
* +---+---+ | V | * |
* | | |
* +---+---+ */
/************** Diagonal Blocks ************
* C <- beta * C + alpha D B */
for (size_t block = 0; block < kf; block++) // Full blocks
fgemm (Fi, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans, //Element field, no transpose
s, t, s, alpha, D + block * s * ldd, //s x s by s x t, alpha DB + beta C
ldd, B + block * s * ldb, ldb, beta, // D_block is block * s rows under D
C + block * s * ldc, ldc);
if (rs) // Last block
fgemm (Fi, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans, //Element field, no transpose
rs, t, rs, alpha, D + kf * s * ldd, //rs x rs by rs x t, alpha DB + beta C
ldd, B + kf * s * ldb, ldb, beta, // D_kf is kf * s rows under D
C + kf * s * ldc, ldc);
/************ Lower Triangular Part ***********/
typename Field::Element_ptr Temp1 = FFLAS::fflas_new(Fi, s, t);
typename Field::Element_ptr Temp2 = FFLAS::fflas_new(Fi, s, t);
if (k > 1)
/* Temp1 <- alpha * Q_1 * B_1 */
fgemm (Fi, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans,
s, t, s, alpha, Q, ldq, B, ldb, Fi.zero, Temp1, t);
/* unrolling by step of 2 to avoid swapping temporaries */
size_t bs = s;
for (size_t block = 0; (block) < kf; block+=2)
{
if ((block + 2) == k)
bs = ls;
if (((block + 2) < kf) || ((kf % 2) == 0) || rs)
{
// Step a
/* C_{block + 2} += P_{block + 2} * Temp1 */
fgemm (Fi, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans,
bs, t, s, Fi.one, P + (block) * s * ldp,
ldp, Temp1, t, Fi.one, C + (block + 1) * s * ldc, ldc);
}
if (((block + 2) < kf) || ((rs) && (kf%2 == 0)))
{
/* Temp2 <- alpha * Q_{block + 2} * B_{block + 2} */
fgemm (Fi, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans,
s, t, s, alpha, Q + (block + 1) * s * ldq, ldq,
B + (block + 1) * s * ldb, ldb, Fi.zero, Temp2, t);
/* Temp2 += R_{block + 2} * Temp1 */
fgemm (Fi, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans,
s, t, s, Fi.one, R + (block) * s * ldr,
ldr, Temp1, t, Fi.one, Temp2, t);
if ((block + 3) == k)
bs = ls;
// Step b
/* C_{block + 3} += P_{block + 3} * Temp2 */
fgemm (Fi, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans,
bs, t, s, Fi.one, P + (block + 1) * s * ldp,
ldp, Temp2, t, Fi.one,
C + (block + 2) * s * ldc, ldc);
}
/* Only needs to be done if the results is useful :
* - Not last loop with instructions
*/
if (((block + 2) < kf) && (((block + 4) < kf) || (kf%2 == 0) || rs))
{
/* Temp1 <- alpha * Q_{block + 3} * B_{block + 3} */
fgemm (Fi, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans,
s, t, s, alpha, Q + (block + 2) * s * ldq,
ldq, B + (block + 2) * s * ldb, ldb, Fi.zero,
Temp1, t);
/* Temp1 += R_{block + 3} * Temp2 */
fgemm (Fi, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans,
s, t, s, Fi.one, R + (block + 1) * s * ldr,
ldr, Temp2, t, Fi.one, Temp1, t);
}
}
/*********** Upper ****************/
/* Copy, but the other way: partial blocks are taken care of first*/
/* Temp1 <- alpha * V_lastv * B_lastb */
if (kf > 1 || rs){
fgemm (Fi, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans, s, t, ls, alpha,
V + (k - 2) * s * ldv, ldv, B + (k - 1) * s * ldb, ldb, Fi.zero, Temp1, t);
}
/* Starting block: n - ls - s */
for (size_t block = 0; (block + 2) < k; block+=2) // Block increases but index decreases
{
/* C_{last - 1 - Block} += U_{last' - block} * Temp1
* last' = last - 1: the last block of U */
fgemm (Fi, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans, s, t, s,
Fi.one, U + (k - 2 - block) * s * ldu, ldu, Temp1, t,
Fi.one, C + (k - 2 - block) * s * ldc, ldc);
/* Temp2 <- alpha * V_{last - 1 - block} * B_{last - 1 - block} */
fgemm (Fi, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans,
s, t, s, alpha, V + (k - 3 - block) * s * ldv,
ldv, B + (k - 2 - block) * s * ldb, ldb, Fi.zero, Temp2, t);
/* Temp2 += W_{last - 1 - block} * Temp1 */
fgemm (Fi, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans,
s, t, s, Fi.one, W + (k - 3 - block) * s * ldw,
ldw, Temp1, t, Fi.one, Temp2, t);
/* C_{last - 2 - Block} += U_{last' - 1 - block} * Temp2
* last' = last - 1 */
fgemm (Fi, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans, s, t, s,
Fi.one, U + (k - 3 - block) * s * ldu, ldu, Temp2, t,
Fi.one, C + (k - 3 - block) * s * ldc, ldc);
/* Only needs to be done if the results is useful :
* - Not last loop
* - Odd amount of U-blocks: C was increased an even number of times at this point
* Partial blocks are taken care of at the beginning */
/* Check first condition depending on 'for' loop */
if (((block + 4) < k)||((k - 1)%2)) // next loop condition passes / (k - 1) U-blocks
{
/* Temp1 <- alpha * V_{last - 2 - block} * B_{last - 2 - block} */
fgemm (Fi, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans,
s, t, s, alpha, V + (k - 4 - block) * s * ldv,
ldv, B + (k - 3 - block) * s * ldb, ldb, Fi.zero, Temp1, t);
/* Temp1 += W_{last - 2 - block} * Temp2 */
fgemm (Fi, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans,
s, t, s, Fi.one, W + (k - 4 - block) * s * ldw,
ldw, Temp2, t, Fi.one, Temp1, t);
}
}
if ((k - 1)%2) // Odd amount of U-blocks, could be included in the for loop but not necessary
/* C_{1} += U_{1} * Temp1 */
fgemm (Fi, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans, s, t, s,
Fi.one, U, ldu, Temp1, t, Fi.one, C, ldc);
FFLAS::fflas_delete(Temp1);
FFLAS::fflas_delete(Temp2);
}
template<class Field>
inline void SSSToDense (const Field& Fi, size_t N, size_t s,
typename Field::ConstElement_ptr P, size_t ldp,
typename Field::ConstElement_ptr Q, size_t ldq,
typename Field::ConstElement_ptr R, size_t ldr,
typename Field::ConstElement_ptr U, size_t ldu,
typename Field::ConstElement_ptr V, size_t ldv,
typename Field::ConstElement_ptr W, size_t ldw,
typename Field::ConstElement_ptr D, size_t ldd,
typename Field::Element_ptr A, size_t lda)
{
/* +--------+------+------+--------+---
* | D1 | U1V2 |U1W2V3|U1W2W3V4|
* +--------+------+------+--------+---
* | P2Q1 | D2 | U2V3 | U2W3V4 |
* A = +--------+------+------+--------+---
* | P3R2Q1 | P3Q2 | D3 | U3V4 |
* +--------+------+------+--------+---
* |P4R3R2Q1|P4R3Q2| P4Q3 | D4 |
* +--------+------+------+--------+---
* | | | | |
*/
/* Block division */
size_t kf = N/s; // Nb of full slices of dimension s
size_t rs = N%s; // Size of the partial block
size_t k = rs ? kf+1 : kf; // Total number of blocks
size_t ls = (rs)? rs: s; // Size of the last block:
/* First block Last block
* D -> D_1, D + ((n - ls) * ldd) -> D_k
* P -> P_2, P + ((n - s - ls) * ldp) -> P_k
* Q -> Q_1, Q + ((n - s - ls) * ldq) -> Q_{k-1}
* R -> R_2, R + ((n - 2s - ls) * ldr) -> R_{k-1}
* U -> U_1, U + ((n - s - ls) * ldu) -> U_{k-1}
* V -> V_2, V + ((n - s - ls) * ldv) -> V_k
* W -> W_2, W + ((n - 2s - ls) * ldw) -> W_{k-1}
*/
/* Unused space:
*
* | | | |
* +---+---+ +---+---+ Code readability is preferred to efficiency:
* | D | * | | | | One could store D_last and V_last in new variables
* +---+---+ | V | * |
* | | |
* +---+---+ */
/******************* Diagonal Blocks **************
* A <- diag (D) */
for (size_t block = 0; block < kf; block++) // Full blocks
/* Diagonal block 'block' starts on row s*block, column s*block */
FFLAS::fassign (Fi, s, s, D + block * s * ldd, ldd, A + block * s * (lda + 1), lda);
if (rs) // Last block
FFLAS::fassign (Fi, rs, rs, D + kf * s * ldd, ldd, A + kf * s * (lda + 1), lda);
/************** Lower triangular part **********************/
/* Blocks are computed row by row, by successively applying the R and Q to the P(RRR...) */
typename Field::Element_ptr Temp1 = FFLAS::fflas_new(Fi, s, s);
typename Field::Element_ptr Temp2 = FFLAS::fflas_new(Fi, s, s);
size_t bsize;
// Loop on rows which have LT part
for (size_t row = 0; row < k - 1; row++)
{
// In the last iteration, the block may not be full
bsize = (row == k - 2)? ls: s;
/* A_{row + 2, row + 1} <- P_{row + 2} * Q_{row + 1} */
fgemm (Fi, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans,
bsize, s, s, Fi.one, P + row * s * ldp, // In this loop, all blocks are s x s
ldp, Q + row * s * ldq, ldq, Fi.zero, A + (row + 1) * s * lda + row * s, lda);
if (row > 0) /* After row 2, R is also applied
* Next inner loop needs Temp1, which is first set here
* Temp1 <- P_{row + 2} * R_{row + 1} */
fgemm (Fi, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans, bsize, s, s, Fi.one, P + row * s * ldp,
ldp, R + (row - 1) * s * ldr, ldr, Fi.zero, Temp1, s);
/* unrolling by step of 2 to avoid swapping temporaries */
for (size_t block = 1; block < row; block+=2)
{
/* A_{row + 2, row + 1 - block} <- Temp1 * Q_{row + 1 - block} */
fgemm (Fi, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans, bsize, s, s, Fi.one, Temp1, s,
Q + (row - block) * s * ldq, ldq, Fi.zero,
A + (row + 1) * s * lda + (row - block) * s, lda);
/* Temp2 <- Temp1 * R_{row + 1 - block} */
fgemm (Fi, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans, bsize, s, s, Fi.one, Temp1,
s, R + (row - 1 - block) * s * ldr, ldr, Fi.zero, Temp2, s);
/* A_{row + 2, row - block} <- Temp2 * Q_{row - block} */
fgemm (Fi, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans, bsize, s, s, Fi.one, Temp2, s,
Q + (row - block - 1) * s * ldq, ldq, Fi.zero,
A + (row + 1) * s * lda + (row - block - 1) * s, lda);
/* If necessary:
* - Not last loop
* - One more block
* -> At least one more block */
if (block + 1 < row)
/* Temp1 <- Temp2 * R_{row - block} */
fgemm (Fi, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans, bsize, s, s, Fi.one, Temp2,
s, R + (row - block - 2) * s * ldr, ldr, Fi.zero, Temp1, s);
}
/* First column if not done already */
if (row%2)
/* A_{row + 2, 1} <- Temp1 * Q_{1} */
fgemm (Fi, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans, bsize, s, s, Fi.one, Temp1, s,
Q, ldq, Fi.zero, A + (row + 1) * s * lda, lda);
}
/******************* Upper triangular part *****************/
/* Symmetrically identical to the lower part (Could be merged with transposes) */
for (size_t column = 0; column < k - 1; column++) // Loop on columns which have a W
{
// In the last iteration, the block may not be full
bsize = (column == k - 2)? ls: s;
/* A_{column + 1, column + 2} <- U_{column + 1} * V_{column + 2} */
fgemm (Fi, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans,
s, bsize, s, Fi.one, U + column * s * ldu, // In this loop, all blocks are s x s
ldu, V + column * s * ldv, ldv, Fi.zero, A + (column) * s * lda + (column + 1) * s, lda);
/* After column 2, R is also applied */
if (column > 0)
/* Block loop needs Temp1, which is first updated here */
/* Temp1 <- W_{column + 1} * V_{column + 2} */
fgemm (Fi, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans, s, bsize, s, Fi.one,
W + (column - 1) * s * ldw, ldw, V + column * s * ldv, ldv, Fi.zero, Temp1, s);
/* Instructions are doubled in the loop in order to avoid using more than two temporary blocks */
for (size_t block = 1; block < column; block+=2)
{
/* A_{column + 1 - block, column + 2} <- U_{column + 1 - block} * Temp1 */
fgemm (Fi, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans, s, bsize, s, Fi.one,
U + (column - block) * s * ldu, ldu, Temp1, s, Fi.zero,
A + (column - block) * s * lda + (column + 1) * s, lda);
/* Temp2 <- W_{column + 1 - block} * Temp1 */
fgemm (Fi, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans, s, bsize, s, Fi.one,
W + (column - 1 - block) * s * ldw, ldw, Temp1, s, Fi.zero, Temp2, s);
/* A_{column - block, column + 2} <- U_{column - block} * Temp2 */
fgemm (Fi, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans, s, bsize, s, Fi.one,
U + (column - block - 1) * s * ldu,
ldu, Temp2, s, Fi.zero, A + (column - block - 1) * s * lda + (column + 1) * s, lda);
/* If necessary:
* - Not last loop
* - One more block
* -> At least one more block */
if (block + 1 < column)
/* Temp1 <- W_{column - block} * Temp2 */
fgemm (Fi, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans, s, bsize, s, Fi.one,
W + (column - block - 2) * s * ldw, ldw, Temp2, s, Fi.zero, Temp1, s);
}
/* First row if not done already */
if (column%2)
/* A_{1, column + 2} <- U_{1} * Temp1 */
fgemm (Fi, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans, s, bsize, s, Fi.one, U,
ldu, Temp1, s, Fi.zero, A + (column + 1) * s, lda);
}
FFLAS::fflas_delete(Temp1);
FFLAS::fflas_delete(Temp2);
}
template<class Field>
inline void DenseToSSS (const Field& Fi, size_t N, size_t s,
typename Field::Element_ptr P, size_t ldp,
typename Field::Element_ptr Q, size_t ldq,
typename Field::Element_ptr R, size_t ldr,
typename Field::Element_ptr U, size_t ldu,
typename Field::Element_ptr V, size_t ldv,
typename Field::Element_ptr W, size_t ldw,
typename Field::Element_ptr D, size_t ldd,
typename Field::ConstElement_ptr A, size_t lda)
{
/* +--------+------+------+--------+---
* | D1 | U1V2 |U1W2V3|U1W2W3V4|
* +--------+------+------+--------+---
* | P2Q1 | D2 | U2V3 | U2W3V4 |
* A <- +--------+------+------+--------+---
* | P3R2Q1 | P3Q2 | D3 | U3V4 |
* +--------+------+------+--------+---
* |P4R3R2Q1|P4R3Q2| P4Q3 | D4 |
* +--------+------+------+--------+---
* | | | | |
*/
/* Block division */
size_t kf = N/s; // Nb of full slices of dimension s
size_t rs = N%s; // Size of the partial block
size_t k = rs ? kf+1 : kf; // Total number of blocks
size_t ls = (rs)? rs: s; // Size of the last block:
/* First block Last block
* D -> D_1, D + ((n - ls) * ldd) -> D_k
* P -> P_2, P + ((n - s - ls) * ldp) -> P_k
* Q -> Q_1, Q + ((n - s - ls) * ldq) -> Q_{k-1}
* R -> R_2, R + ((n - 2s - ls) * ldr) -> R_{k-1}
* U -> U_1, U + ((n - s - ls) * ldu) -> U_{k-1}
* V -> V_2, V + ((n - s - ls) * ldv) -> V_k
* W -> W_2, W + ((n - 2s - ls) * ldw) -> W_{k-1}
*/
/* Unused space:
*
* | | | |
* +---+---+ +---+---+ Code readability is preferred to efficiency:
* | D | * | | | | One could store D_last and V_last in new variables
* +---+---+ | V | * |
* | | |
* +---+---+ */
/******************* Diagonal Blocks **************
* A <- diag (D) */
for (size_t block = 0; block < kf; block++) // Full blocks
/* Diagonal block 'block' starts on row s*block, column s*block */
FFLAS::fassign (Fi, s, s, A + block * s * (lda + 1), lda, D + block * s * ldd, ldd);
if (rs) // Last block
FFLAS::fassign (Fi, rs, rs, A + kf * s * (lda + 1), lda, D + kf * s * ldd, ldd);
if (N > s) // Otherwise we're done
{
size_t fs = s;
if (N - s < s)
{
fs = N - s;
FFLAS::fzero(Fi, s, s, U, ldu); // U, Q^T will only be updated on their fs first columns
FFLAS::fzero(Fi, s, s, Q, ldq);
}
/******************* Upper triangular part *****************/
// Temporary submatrix, copied to be pluqed
typename Field::Element_ptr H = FFLAS::fflas_new (Fi, N, N);
FFLAS::fassign (Fi, N, N, A, lda, H, N);
size_t * p = FFLAS::fflas_new<size_t> (N - ls); /* - ls instead of -s so that the size remains
greater than s */
size_t * q = FFLAS::fflas_new<size_t> (N - ls);
size_t r = FFPACK::PLUQ (Fi, FFLAS::FflasNonUnit, s, N - s, H + s, N, p, q);
// pL -> U_1
FFPACK::getTriangular(Fi, FFLAS::FflasLower, FFLAS::FflasUnit, s, fs, r, H + s,
N, U, ldu);
FFPACK::applyP (Fi, FFLAS::FflasLeft, FFLAS::FflasTrans,
fs, 0, s, U, ldu, p);
// Uq -> V_2
FFPACK::getTriangular(Fi, FFLAS::FflasUpper, FFLAS::FflasNonUnit, s, N - s, r,
H + s, N); // Remove L
FFPACK::applyP (Fi, FFLAS::FflasRight, FFLAS::FflasNoTrans, s,
0, N - s, H + s, N, q); // Apply permutation q to H
FFLAS::fassign (Fi, s, fs, H + s, N, V, ldv);
// Temporary matrix for storing lower triangular of pluq
typename Field::Element_ptr Temp = FFLAS::fflas_new (Fi, 2 * s, s);
for (size_t brow = 0; brow < k - 2; brow++)
{
r = FFPACK::PLUQ (Fi, FFLAS::FflasNonUnit, s*(2),
N - s*(brow + 2), H + N * s * brow + s * (brow + 2), N,
p, q);
// pL -> [W_{brow + 2} \\ U_{brow + 2}]
// 1) L -> Temp
FFLAS::fzero (Fi, 2 * s, s, Temp, s);
FFPACK::getTriangular(Fi, FFLAS::FflasLower, FFLAS::FflasUnit, 2*s,
N - s * (brow + 2), r,
H + s * N * brow + s * (brow + 2),
N, Temp, s, true);
// 2) p * Temp -> Temp
FFPACK::applyP (Fi, FFLAS::FflasLeft, FFLAS::FflasTrans, s,
0, 2*s, Temp, s, p);
// 3) Temp1 -> W_{brow + 2}
FFLAS::fassign (Fi, s, s, Temp, s, W + ldw * s * brow, ldw);
// 4) Temp2 -> U_{brow + 2}
FFLAS::fassign (Fi, s, s, Temp + s * s, s, U + ldu * s * (brow + 1), ldu);
// Uq -> [V_{brow + 3} & H]
FFPACK::getTriangular(Fi, FFLAS::FflasUpper, FFLAS::FflasNonUnit, s, N - s * (brow + 2),
r,
H + s * N * brow + s * (brow + 2), N,
H + s * N * (brow + 1) + s * (brow + 2), N); // Remove L
FFPACK::applyP (Fi, FFLAS::FflasRight, FFLAS::FflasNoTrans, s,
0, N - s* (brow + 2), H + s * N * (brow + 1) + s * (brow + 2), N,
q); // Apply permutation q to H
FFLAS::fassign (Fi, s, s, H + s * N * (brow + 1) + s * (brow + 2), N,
V + ldv * s * (brow + 1), ldv); /* Sould not cause any trouble even if
last block*/
}
FFLAS::fflas_delete (Temp);
/******************* Lower triangular part *****************/
// Does it need to be copied? It seems easier than to play with transposes
r = FFPACK::PLUQ (Fi, FFLAS::FflasNonUnit, N - s, s, H + s * N, N, p, q);
// Uq -> Q_1 []
FFPACK::getTriangular(Fi, FFLAS::FflasUpper, FFLAS::FflasNonUnit, fs, s, r, H + s * N,
N, Q, ldq);
FFPACK::applyP (Fi, FFLAS::FflasRight, FFLAS::FflasNoTrans, fs,
0, s, Q, ldq, q);
// pL -> [P_2 \\ H]
// Remove L
FFPACK::getTriangular(Fi, FFLAS::FflasLower, FFLAS::FflasUnit, N - s, s, r,
H + s * N, N);
// Apply permutation p to H
FFPACK::applyP (Fi, FFLAS::FflasLeft, FFLAS::FflasTrans, s,
0, N - s, H + s * N, N, p);
FFLAS::fassign (Fi, fs, s, H + s * N, N, P, ldp);
// Temporary matrix for storing upper triangular of pluq
typename Field::Element_ptr TempFlat = FFLAS::fflas_new (Fi, s, 2*s);
for (size_t brow = 0; brow < k - 2; brow++)
{
r = FFPACK::PLUQ (Fi, FFLAS::FflasNonUnit,
N - s*(brow + 2), s*(2), H + N * s * (brow + 2) + s * (brow), N,
p, q);
// Uq -> [R_{brow + 2} & Q_{brow + 2}]
// 1) U -> TempFlat
FFLAS::fzero (Fi, s, 2 * s, TempFlat, 2 * s);
FFPACK::getTriangular(Fi, FFLAS::FflasUpper, FFLAS::FflasNonUnit,
N - s * (brow + 2), 2*s, r,
H + s * N * (brow + 2) + s * (brow),
N, TempFlat, 2 * s, true);
// 2) TempFlat * q -> TempFlat
FFPACK::applyP (Fi, FFLAS::FflasRight, FFLAS::FflasNoTrans, s,
0, 2*s, TempFlat, 2 * s, q);
// 3) TempFlat_1 -> R_{brow + 2}
FFLAS::fassign (Fi, s, s, TempFlat, 2 * s, R + s * ldr * brow, ldr);
// 4) TempFlat_2 -> Q_{brow + 2}
FFLAS::fassign (Fi, s, s, TempFlat + s, 2 * s, Q + ldq * s * (brow + 1), ldq);
// pL -> [P_{brow + 3} \\ H]
// Remove L
FFPACK::getTriangular(Fi, FFLAS::FflasLower, FFLAS::FflasUnit, N - s * (brow + 2), s, r,
H + s * N * (brow + 2) + s * (brow), N,
H + s * N * (brow + 2) + s * (brow + 1), N);
// Apply permutation p to H
FFPACK::applyP (Fi, FFLAS::FflasLeft, FFLAS::FflasTrans, s,
0, N - s* (brow + 2), H + s * N * (brow + 2) + s * (brow + 1), N,
p);
// H -> P_{brow + 3}
FFLAS::fassign (Fi, ((brow == k - 3)? ls: s), s, H + s * N * (brow + 2) + s * (brow + 1), N,
P + ldp * s * (brow + 1), ldp);
}
FFLAS::fflas_delete (H, p, q, TempFlat);
}
}
}
#endif
/* -*- mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
// vim:sts=4:sw=4:ts=4:et:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
Computing file changes ...