% https://github.com/pushingthelimitsofonlineautotuning/core-configs
% Tip revision: 2fc41ae4889c7e0f39a01bd8d4ba2581698ddafb authored by Damien Couroussé on 31 March 2016, 15:58:11 UTC
% initial import
% initial import
% Tip revision: 2fc41ae
% main.tex
% !TEX TS-program = pdflatex
% !TEX encoding = UTF-8 Unicode
% Stand-alone article-class document that typesets the tables defined below
% (plus whatever CPU-areas.tex contributes via \input).
% Package order is left untouched on purpose: load order can matter in LaTeX.
\documentclass[10pt]{article} % 10pt base font size (the article-class default, stated explicitly)
\usepackage[utf8]{inputenc} % set input encoding (not needed with XeLaTeX)
%%% PAGE DIMENSIONS
\usepackage{geometry} \geometry{a4paper} % A4 page size
\usepackage{graphicx} % support the \includegraphics command and options
%%% PACKAGES
\usepackage{booktabs} % for much better looking tables (the tables visible in this file use \hline/\cline, not booktabs rules)
\usepackage{array} % for better arrays (eg matrices) in maths
\usepackage{paralist} % very flexible & customisable lists (eg. enumerate/itemize, etc.)
\usepackage{verbatim} % adds environment for commenting out blocks of text & for better verbatim
\usepackage{subfig} % make it possible to include more than one captioned figure/table in a single float
\usepackage{threeparttable} % provides \tnote and the tablenotes environment used by the tables below
\usepackage{rotating} % provides the sidewaystable float used by every table below
\usepackage{multirow} % provides \multirow for cells that span several rows
\usepackage{gensymb} % generic symbol macros; no use is visible in this file (possibly needed by CPU-areas.tex -- TODO confirm)
\usepackage[mode=text]{siunitx} % number/unit typesetting with text-mode output; no \SI/\num call is visible in this file
% \monocol{<colspec>}: shorthand for \multicolumn{1}{<colspec>}. The brace group
% that follows the call becomes the cell contents, e.g. \monocol{|l|}{DRAM}.
% Used below to override the rules/alignment of a single cell.
\newcommand{\monocol}[1]{\multicolumn{1}{#1}}
% The \\ forces a line break inside the title.
\title{Pushing the limits of Online Auto-tuning: \\ Core Configurations}
\author{Fernando Endo \and Damien Courouss\'{e} \and Henri-Pierre Charles}
\begin{document}
\maketitle
% Table tab:sim-params -- main parameters of the three simulated core
% configurations (single-, dual- and triple-issue).
% Layout notes:
%  - \centering is issued inside the float. Wrapping the float in a center
%    environment does not centre its contents and only adds vertical space.
%  - threeparttable groups caption, tabular and tablenotes, so the \tnote
%    marks and the notes are handled the way the package intends (notes are
%    set to the width of the table).
%  - The first two columns form one logical "Parameter" column: rows either
%    span both (\multicolumn{2}) or split into group name + detail (\monocol).
\begin{sidewaystable}
\centering
\begin{threeparttable}
\caption{Main parameters of the simulated cores.}
\label{tab:sim-params}
\begin{tabular}{|ll|c|c|c|} \hline
\multicolumn{2}{|c|}{Parameter} & Single-issue & Dual-issue & Triple-issue \\ \hline
\multicolumn{2}{|l|}{Pipeline type} & IO only & IO or OOO& IO or OOO \\ \hline
\multicolumn{2}{|l|}{Core clock} & 1.4~GHz & 1.6~GHz & 2.0~GHz \\ \hline
\monocol{|l|}{DRAM} & Size/clock/latency (ns) & 256~MB/933~MHz/81 & 256~MB/933~MHz/81 & 256~MB/933~MHz/81 \\ \hline
\monocol{|l|}{L2} & Size/assoc./lat./MSHRs/WBs & 512~kB/8/3/8/16 & 1024~kB/8/5/8/16& 2048~kB/16/8/11/16 \\ \hline
\monocol{|l|}{L1-I} & Size/assoc./lat./MSHRs & 32~kB/2/1/2 & 32~kB/2/1/2 & 32~kB/2/1/2 \\ \hline
\monocol{|l|}{L1-D} & Size/assoc./lat./MSHRs/WBs & 32~kB/4/1/4/4 & 32~kB/4/1/5/8 & 32~kB/2/1/6/16 \\ \hline
\monocol{|l|}{Stride prefet.} & Cache level/degree/buffer size & 1/1/8 & 1/1/12 & 2/1/16 \\ \hline
% Two-row groups: \multirow carries the group name, \cline{2-5} separates the
% sub-rows without cutting through the group-name cell.
\monocol{|l|}{\multirow{2}{*}{Branch pred.}}& Global/local history entries (bits) & 256 (2)/- & 4096 (2)/- & 4096 (2)/1024 (3) \\ \cline{2-5}
\monocol{|c|}{} & BTB/RAS entries & 256/8 & 4096/16 & 4096/48 \\ \hline
\multicolumn{2}{|l|}{Front-end/back-end width} & 1/1 & 2/4 & 3/7 \\ \hline
\multicolumn{2}{|l|}{INT/FP pipeline depth (+ extra OOO stages)} & 8/10 & 8/12 (+3) & 9/18 (+6) \\ \hline
\multicolumn{2}{|l|}{Physical INT/FP registers\tnote{2}} & - & 82/256 & 90/256 \\ \hline
% The \,\, after \tnote{2} leaves room between the note mark and "entries".
\multicolumn{2}{|l|}{ITLB/DTLB/IQ/LSQ/ROB\tnote{2}\,\, entries} & 32/32/16/8~each/- & 64/64/32/12~each/40 & 128/128/48/16~each/60 \\ \hline
\monocol{|l|}{\multirow{2}{*}{INT units}} & ALU/MUL execution ports & 1/1 & 2/1 & 2/1 \\ \cline{2-5}
\monocol{|c|}{} & ADD/MUL cycles & 1/4 & 1/4 & 1/4 \\ \hline
\monocol{|l|}{\multirow{2}{*}{FP/SIMD}} & Execution ports & 1 & 1 or 2 & 1, 2 or 3 \\ \cline{2-5}
\monocol{|c|}{} & VADD/VMUL/VMLA cycles & 3/4/6 & 4/5/8 & 10/12/20 \\ \hline
\monocol{|l|}{\multirow{2}{*}{Load/store}} & Execution ports & 1 shared & 1 shared & 1 for each \\ \cline{2-5}
\monocol{|c|}{} & Load/store cycles & 1/1 & 2/1 & 3/2 \\ \hline
\end{tabular}
\begin{tablenotes}
% NOTE(review): no cell in the table carries \tnote{1}, so note 1 is currently
% unreferenced. It presumably belongs on the ITLB/DTLB entry -- confirm with
% the authors before adding the mark.
\item [1] Over-dimensioned to compensate for the lack of an L2-TLB.
\item [2] For OOO only.
\end{tablenotes}
\end{threeparttable}
\end{sidewaystable}
% Pull in the contents of CPU-areas.tex at this point (external file, not
% visible here).
\input{CPU-areas}
%\input{}
% Table tab:hwres-all-runtimes -- execution times (seconds) per benchmark,
% input size and SISD/SIMD version, for four kernel variants on the Cortex-A8
% and the Cortex-A9.
% Layout notes:
%  - \centering is issued inside the float. Wrapping the float in a center
%    environment does not centre its contents and only adds vertical space.
%  - threeparttable groups caption, tabular and tablenotes, so the notes are
%    set to the width of the table.
%  - Columns 1-3 identify the row (benchmark, input, version); columns 4-7 hold
%    the Cortex-A8 results and columns 8-11 the Cortex-A9 results.
\begin{sidewaystable}
\centering
\begin{threeparttable}
\caption{Execution time (seconds) of the benchmarks with the original and
specialized reference kernels, and with the online auto-tuned and the
best statically auto-tuned kernels, on the real platforms (all
run-time overheads included).}
\label{tab:hwres-all-runtimes}
\begin{tabular}{|l|l|l|c|c|c|c|c|c|c|c|} \hline
% Two-line header: the first line names the platforms, the second the variants.
\monocol{|c|}{\multirow{2}{*}{Benchmark}} & \monocol{c|}{\multirow{2}{*}{Input}} & \multirow{2}{*}{Version} & \multicolumn{4}{c|}{Cortex-A8} & \multicolumn{4}{c|}{Cortex-A9} \\ \cline{4-11}
& & & Ref. & Spec.\ Ref.& O-AT & BS-AT & Ref. & Spec.\ Ref.& O-AT & BS-AT \\ \hline
% \cline{3-11} separates SISD/SIMD rows of one input; \cline{2-11} separates inputs.
\multirow{6}{*}{Streamcluster}&\multirow{2}{*}{Small} & SISD & 9.75 & 10.2 & 9.26 & 9.06 & 3.26 & 4.00 & 2.66 & 2.47 \\ \cline{3-11}
& & SIMD & 3.84 & 3.79 & 3.74 & 3.51 & 3.33 & 2.90 & 2.51 & 2.24 \\ \cline{2-11}
&\multirow{2}{*}{Medium}& SISD & 19.9 & 21.8 & 17.9 & 17.8 & 7.54 & 11.1 & 5.87 & 5.68 \\ \cline{3-11}
& & SIMD & 7.13 & 7.05 & 6.59 & 5.93 & 9.09 & 8.86 & 5.09 & 4.84 \\ \cline{2-11}
&\multirow{2}{*}{Large} & SISD & 46.8 & 46.1 & 41.0 & 40.8 & 14.8 & 14.7 & 12.0 & 11.3 \\ \cline{3-11}
& & SIMD & 15.1 & 15.0 & 11.1 & 10.2 & 17.2 & 15.1 & 10.1 & 9.84 \\ \hline
\multirow{6}{*}{VIPS lintra} &\multirow{2}{*}{Small} & SISD & 0.841& 0.842 & 0.676 & 0.640 & 0.502& 0.504 & 0.456 & 0.443 \\ \cline{3-11}
& & SIMD & 0.556& 0.563 & 0.584 & 0.510 & 0.455& 0.454 & 0.471 & 0.442 \\ \cline{2-11}
&\multirow{2}{*}{Medium}& SISD & 2.30 & 2.29 & 1.76 & 1.73 & 1.47 & 1.41 & 1.37 & 1.24 \\ \cline{3-11}
& & SIMD & 1.47 & 1.48 & 1.40 & 1.36 & 1.31 & 1.41 & 1.31 & 1.26 \\ \cline{2-11}
&\multirow{2}{*}{Large} & SISD & 26.6 & 24.3 & 25.1 & 22.9 & 10.1 & 9.63 & 9.88 & 9.49 \\ \cline{3-11}
& & SIMD & 24.7 & 24.9 & 24.0 & 22.2 & 10.4 & 10.0 & 9.94 & 9.54 \\ \hline
\end{tabular}
\begin{tablenotes}
% Empty labels ([]) give unnumbered notes that act as a legend for the
% column abbreviations; spelling matches the column headers.
\item [] Spec.\ Ref.: Reference kernel specialized as in the auto-tuned
versions.
\item [] O-AT: Online auto-tuned kernel.
\item [] BS-AT: Best statically auto-tuned kernel.
\end{tablenotes}
\end{threeparttable}
\end{sidewaystable}
% Table tab:online-stats -- statistics of online auto-tuning on the Cortex-A8
% and Cortex-A9, per benchmark and input set.
% Layout notes:
%  - \centering replaces the center environment inside the float, which would
%    add extra vertical space around the table.
%  - Numeric ranges use an en-dash (--); percentages and units are tied to
%    their number with ~ consistently.
%  - Narrow multi-line header cells are \parbox'es; \- marks the allowed break
%    point in "Explorable" and "Streamcluster" instead of a hard-coded hyphen.
\begin{sidewaystable}
\centering
\caption{Statistics of online auto-tuning in the Cortex-A8 and A9
(SISD / SIMD separated, or average if minor variations).}
\label{tab:online-stats}
\begin{tabular}{|l|l|c|c|c|c|c|c|c|c|c|} \hline
% Three-line header: columns 1-4 span all three lines, columns 5-11 are grouped
% under "Run-time regeneration and space exploration".
\multirow{3}{*}{Bench.} &\multirow{3}{*}{Input set}&\multirow{3}{*}{\parbox{1.4cm}{\centering{}Explo\-rable versions}}& \multirow{3}{*}{\parbox{1.8cm}{\centering{}Exploration limit in one run}}&\multicolumn{7}{c|}{Run-time regeneration and space exploration} \\ \cline{5-11}
& & & &\multirow{2}{*}{\parbox{1cm}{\centering{}Kernel calls}} &\multicolumn{2}{c|}{Explored} &\multicolumn{2}{c|}{Overhead to bench. run-time} &\multicolumn{2}{c|}{Duration to kernel life} \\ \cline{6-11}
& & & & &A8 &A9 &A8 &A9 &A8 &A9 \\ \hline
% Streamcluster: one kernel-call count shared by the three input sets.
\multirow{3}{*}{\parbox{1cm}{Stream\-cluster}}&Small &390 &43--49 &\multirow{3}{*}{5315388}&49 &49 &0.2~\% (11~ms)&0.4~\% (9.2~ms)&13 / 4.4~\% &32~\% \\
&Medium &510 &55--61 & &58 &61 &0.2~\% (17~ms)&0.3~\% (15~ms) &6.3 / 2.7~\% &22~\% \\
&Large &630 &67--73 & &67 &73 &0.2~\% (30~ms)&0.2~\% (26~ms) &5.6 / 1.8~\% &15~\% \\ \hline
\multirow{3}{*}{VIPS} &Small &858 &106--112 &1200 &44 &28 &4.2~\% (26~ms)&2.5~\% (12~ms) &100~\% &100~\% \\
&Medium &330 &39--45 &2336 &40 &42 &0.9~\% (14~ms)&1.0~\% (14~ms) &18~\% &66~\% \\
&Large &596 &73--79 &5500 &75 &71 &0.3~\% (71~ms)&0.8~\% (78~ms) &28~\% &86~\% \\ \hline
\end{tabular}
% Disabled table notes, kept for reference. No cell currently carries a
% matching \tnote mark.
% \begin{tablenotes}
% \item [1] Depends on the best configuration found in the first phase.
% \item [2] Percentage of kernel run-time during which new versions were explored, estimated through the number of active function calls.
% \item [3] SISD / SIMD discrimination if variations are greater than a factor of two.
% \end{tablenotes}
\end{sidewaystable}
\end{document}