% Source: https://github.com/pushingthelimitsofonlineautotuning/core-configs
% Tip revision: 2fc41ae4889c7e0f39a01bd8d4ba2581698ddafb authored by Damien Couroussé on 31 March 2016, 15:58:11 UTC (initial import)
% File: main.tex
% !TEX TS-program = pdflatex
% !TEX encoding = UTF-8 Unicode

% Companion document to "Pushing the limits of Online Auto-tuning": a title
% block followed by sideways tables (simulated core parameters, benchmark
% execution times, online auto-tuning statistics) and the contents pulled in
% from CPU-areas.

\documentclass[10pt]{article} % 10pt is already the article default; pass 11pt or 12pt for larger type

\usepackage[utf8]{inputenc} % set input encoding (not needed with XeLaTeX)

%%% PAGE DIMENSIONS
\usepackage{geometry} \geometry{a4paper} % A4 paper size

\usepackage{graphicx} % support the \includegraphics command and options

%%% PACKAGES
\usepackage{booktabs} % for much better looking tables
\usepackage{array} % for better arrays (eg matrices) in maths
\usepackage{paralist} % very flexible & customisable lists (eg. enumerate/itemize, etc.)
\usepackage{verbatim} % adds environment for commenting out blocks of text & for better verbatim
\usepackage{subfig} % make it possible to include more than one captioned figure/table in a single float
\usepackage{threeparttable} % \tnote marks and the tablenotes environment used with the tables below
\usepackage{rotating} % sidewaystable: table floats rotated onto a landscape page
\usepackage{multirow} % \multirow cells spanning several table rows
\usepackage{gensymb} % NOTE(review): no gensymb symbol is used in the visible part of this file; possibly needed by CPU-areas -- confirm
\usepackage[mode=text]{siunitx} % NOTE(review): no siunitx macro is used in the visible part of this file; possibly needed by CPU-areas -- confirm

% \monocol{<colspec>}{<text>}: a one-column \multicolumn, used in the tables to
% override the column specification (alignment and vertical rules) of a single cell.
\newcommand{\monocol}[1]{\multicolumn{1}{#1}}

\title{Pushing the limits of Online Auto-tuning: \\ Core Configurations}
\author{Fernando Endo \and Damien Courouss\'{e} \and Henri-Pierre Charles}


\begin{document}
\maketitle





% Table tab:sim-params: main parameters of the three simulated cores
% (single-, dual- and triple-issue), typeset on a rotated float page.
\begin{sidewaystable}
  % Centre from inside the float: a center environment wrapped around a float
  % does not reach the float's contents.
  \centering
  \caption{Main parameters of the simulated cores.}
  \label{tab:sim-params}
  % threeparttable ties the \tnote marks and the tablenotes block to the
  % tabular, so the notes are set to the width of the table.
  \begin{threeparttable}
    \begin{tabular}{|ll|c|c|c|} \hline
      \multicolumn{2}{|c|}{Parameter}
        & Single-issue & Dual-issue & Triple-issue \\ \hline
      \multicolumn{2}{|l|}{Pipeline type}
        & IO only & IO or OOO & IO or OOO \\ \hline
      \multicolumn{2}{|l|}{Core clock}
        & 1.4~GHz & 1.6~GHz & 2.0~GHz \\ \hline
      \monocol{|l|}{DRAM} & Size/clock/latency (ns)
        & 256~MB/933~MHz/81 & 256~MB/933~MHz/81 & 256~MB/933~MHz/81 \\ \hline
      \monocol{|l|}{L2} & Size/assoc./lat./MSHRs/WBs
        & 512~kB/8/3/8/16 & 1024~kB/8/5/8/16 & 2048~kB/16/8/11/16 \\ \hline
      \monocol{|l|}{L1-I} & Size/assoc./lat./MSHRs
        & 32~kB/2/1/2 & 32~kB/2/1/2 & 32~kB/2/1/2 \\ \hline
      \monocol{|l|}{L1-D} & Size/assoc./lat./MSHRs/WBs
        & 32~kB/4/1/4/4 & 32~kB/4/1/5/8 & 32~kB/2/1/6/16 \\ \hline
      \monocol{|l|}{Stride prefet.} & Cache level/degree/buffer size
        & 1/1/8 & 1/1/12 & 2/1/16 \\ \hline
      % Two-row groups: the group label is a \multirow in the first column and
      % \cline{2-5} keeps the rule out of that label cell.
      \monocol{|l|}{\multirow{2}{*}{Branch pred.}} & Global/local history entries (bits)
        & 256 (2)/- & 4096 (2)/- & 4096 (2)/1024 (3) \\ \cline{2-5}
      \monocol{|c|}{} & BTB/RAS entries
        & 256/8 & 4096/16 & 4096/48 \\ \hline
      \multicolumn{2}{|l|}{Front-end/back-end width}
        & 1/1 & 2/4 & 3/7 \\ \hline
      \multicolumn{2}{|l|}{INT/FP pipeline depth (+ extra OOO stages)}
        & 8/10 & 8/12 (+3) & 9/18 (+6) \\ \hline
      \multicolumn{2}{|l|}{Physical INT/FP registers\tnote{2}}
        & - & 82/256 & 90/256 \\ \hline
      \multicolumn{2}{|l|}{ITLB/DTLB/IQ/LSQ/ROB\tnote{2}\,\, entries}
        & 32/32/16/8~each/- & 64/64/32/12~each/40 & 128/128/48/16~each/60 \\ \hline
      \monocol{|l|}{\multirow{2}{*}{INT units}} & ALU/MUL execution ports
        & 1/1 & 2/1 & 2/1 \\ \cline{2-5}
      \monocol{|c|}{} & ADD/MUL cycles
        & 1/4 & 1/4 & 1/4 \\ \hline
      \monocol{|l|}{\multirow{2}{*}{FP/SIMD}} & Execution ports
        & 1 & 1 or 2 & 1, 2 or 3 \\ \cline{2-5}
      \monocol{|c|}{} & VADD/VMUL/VMLA cycles
        & 3/4/6 & 4/5/8 & 10/12/20 \\ \hline
      \monocol{|l|}{\multirow{2}{*}{Load/store}} & Execution ports
        & 1 shared & 1 shared & 1 for each \\ \cline{2-5}
      \monocol{|c|}{} & Load/store cycles
        & 1/1 & 2/1 & 3/2 \\ \hline
    \end{tabular}
    \begin{tablenotes}
      % NOTE(review): no \tnote{1} appears in the table, so note 1 is never
      % referenced. It presumably belongs to the ITLB/DTLB entries -- confirm
      % and add the mark there.
      \item [1] Over-dimensioned to compensate for the lack of L2-TLB.
      \item [2] For OOO only.
    \end{tablenotes}
  \end{threeparttable}
\end{sidewaystable}



% Insert the contents of the separate file CPU-areas at this point.
\input{CPU-areas}


% NOTE(review): leftover placeholder with an empty file name -- fill in or remove.
%\input{}
% Table tab:hwres-all-runtimes: benchmark execution times on the Cortex-A8 and
% Cortex-A9 for each benchmark, input size and SISD/SIMD version.
\begin{sidewaystable}
  % Centre from inside the float: a center environment wrapped around a float
  % does not reach the float's contents.
  \centering
  \caption{Execution time (seconds) of the benchmarks with the original and
  specialized reference kernels, and with the online auto-tuned and the
  best statically auto-tuned kernels, on the real platforms (all
  run-time overheads included).}
  \label{tab:hwres-all-runtimes}
  % threeparttable sets the tablenotes block to the width of the tabular.
  \begin{threeparttable}
    \begin{tabular}{|l|l|l|c|c|c|c|c|c|c|c|} \hline
      % Two header rows: the first three columns span both rows, the remaining
      % eight are grouped by platform.
      \monocol{|c|}{\multirow{2}{*}{Benchmark}} & \monocol{c|}{\multirow{2}{*}{Input}} & \multirow{2}{*}{Version}
        & \multicolumn{4}{c|}{Cortex-A8} & \multicolumn{4}{c|}{Cortex-A9} \\ \cline{4-11}
      & & & Ref. & Spec.\ Ref. & O-AT & BS-AT & Ref. & Spec.\ Ref. & O-AT & BS-AT \\ \hline
      % Streamcluster
      \multirow{6}{*}{Streamcluster} & \multirow{2}{*}{Small} & SISD
        & 9.75 & 10.2 & 9.26 & 9.06 & 3.26 & 4.00 & 2.66 & 2.47 \\ \cline{3-11}
      & & SIMD
        & 3.84 & 3.79 & 3.74 & 3.51 & 3.33 & 2.90 & 2.51 & 2.24 \\ \cline{2-11}
      & \multirow{2}{*}{Medium} & SISD
        & 19.9 & 21.8 & 17.9 & 17.8 & 7.54 & 11.1 & 5.87 & 5.68 \\ \cline{3-11}
      & & SIMD
        & 7.13 & 7.05 & 6.59 & 5.93 & 9.09 & 8.86 & 5.09 & 4.84 \\ \cline{2-11}
      & \multirow{2}{*}{Large} & SISD
        & 46.8 & 46.1 & 41.0 & 40.8 & 14.8 & 14.7 & 12.0 & 11.3 \\ \cline{3-11}
      & & SIMD
        & 15.1 & 15.0 & 11.1 & 10.2 & 17.2 & 15.1 & 10.1 & 9.84 \\ \hline
      % VIPS lintra
      \multirow{6}{*}{VIPS lintra} & \multirow{2}{*}{Small} & SISD
        & 0.841 & 0.842 & 0.676 & 0.640 & 0.502 & 0.504 & 0.456 & 0.443 \\ \cline{3-11}
      & & SIMD
        & 0.556 & 0.563 & 0.584 & 0.510 & 0.455 & 0.454 & 0.471 & 0.442 \\ \cline{2-11}
      & \multirow{2}{*}{Medium} & SISD
        & 2.30 & 2.29 & 1.76 & 1.73 & 1.47 & 1.41 & 1.37 & 1.24 \\ \cline{3-11}
      & & SIMD
        & 1.47 & 1.48 & 1.40 & 1.36 & 1.31 & 1.41 & 1.31 & 1.26 \\ \cline{2-11}
      & \multirow{2}{*}{Large} & SISD
        & 26.6 & 24.3 & 25.1 & 22.9 & 10.1 & 9.63 & 9.88 & 9.49 \\ \cline{3-11}
      & & SIMD
        & 24.7 & 24.9 & 24.0 & 22.2 & 10.4 & 10.0 & 9.94 & 9.54 \\ \hline
    \end{tabular}
    % Unlabelled notes expanding the column-header abbreviations.
    \begin{tablenotes}
      \item [] Spec.\ Ref.: Reference kernel specialized as in the auto-tuned
      versions.
      \item [] O-AT: Online auto-tuned kernel.
      \item [] BS-AT: Best statically auto-tuned kernel.
    \end{tablenotes}
  \end{threeparttable}
\end{sidewaystable}



% Table tab:online-stats: statistics of online auto-tuning on the Cortex-A8 and
% Cortex-A9, per benchmark and input set.
\begin{sidewaystable}
  \centering % instead of a center environment, which adds list spacing inside the float
  \caption{Statistics of online auto-tuning in the Cortex-A8 and A9
(SISD / SIMD separated, or average if minor variations).}
  \label{tab:online-stats}
  \begin{tabular}{|l|l|c|c|c|c|c|c|c|c|c|} \hline
    % Three header rows. Long headings sit in fixed-width \parbox cells; \-
    % marks a permitted hyphenation point, so a hyphen is printed only where
    % the word is actually broken.
    \multirow{3}{*}{Bench.}
      & \multirow{3}{*}{Input set}
      & \multirow{3}{*}{\parbox{1.4cm}{\centering{}Explo\-rable versions}}
      & \multirow{3}{*}{\parbox{1.8cm}{\centering{}Exploration limit in one run}}
      & \multicolumn{7}{c|}{Run-time regeneration and space exploration} \\ \cline{5-11}
    & & &
      & \multirow{2}{*}{\parbox{1cm}{\centering{}Kernel calls}}
      & \multicolumn{2}{c|}{Explored}
      & \multicolumn{2}{c|}{Overhead to bench.\ run-time}
      & \multicolumn{2}{c|}{Duration to kernel life} \\ \cline{6-11}
    & & & & & A8 & A9 & A8 & A9 & A8 & A9 \\ \hline
    % Streamcluster: one kernel-call count shared by the three input sets.
    \multirow{3}{*}{\parbox{1cm}{Stream\-cluster}}
      & Small  & 390 & 43--49 & \multirow{3}{*}{5315388}
      & 49 & 49 & 0.2~\% (11~ms) & 0.4~\% (9.2~ms) & 13 / 4.4~\%  & 32~\% \\
    & Medium & 510 & 55--61 &
      & 58 & 61 & 0.2~\% (17~ms) & 0.3~\% (15~ms)  & 6.3 / 2.7~\% & 22~\% \\
    & Large  & 630 & 67--73 &
      & 67 & 73 & 0.2~\% (30~ms) & 0.2~\% (26~ms)  & 5.6 / 1.8~\% & 15~\% \\ \hline
    % VIPS
    \multirow{3}{*}{VIPS}
      & Small  & 858 & 106--112 & 1200
      & 44 & 28 & 4.2~\% (26~ms) & 2.5~\% (12~ms)  & 100~\% & 100~\% \\
    & Medium & 330 & 39--45 & 2336
      & 40 & 42 & 0.9~\% (14~ms) & 1.0~\% (14~ms)  & 18~\%  & 66~\% \\
    & Large  & 596 & 73--79 & 5500
      & 75 & 71 & 0.3~\% (71~ms) & 0.8~\% (78~ms)  & 28~\%  & 86~\% \\ \hline
  \end{tabular}
% Disabled notes, kept for reference; the table contains no \tnote marks
% that would point to them.
%  \begin{tablenotes}
%    \item [1] Depends on the best configuration found in the first phase.
%    \item [2] Percentage of kernel run-time during which new versions were explored, estimated through the number of active function calls.
%    \item [3] SISD / SIMD discrimination if variations are greater than a factor of two.
%  \end{tablenotes}
\end{sidewaystable}




\end{document}
back to top