https://hal.archives-ouvertes.fr/hal-02487579
Raw File
Tip revision: b34a952aad1b143e60e302df55a4f986ab904c42 authored by Software Heritage on 23 January 2007, 00:00:00 UTC
hal: Deposit 433 in collection hal
Tip revision: b34a952
UserManual.htm
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"
            "http://www.w3.org/TR/REC-html40/loose.dtd">
<HTML>
<HEAD>
<TITLE>OcamlP3l 2.0: User Manual
</TITLE>

<META http-equiv="Content-Type" content="text/html; charset=US-ASCII">
<META name="GENERATOR" content="hevea 1.09">
<STYLE type="text/css">
.li-itemize{margin:1ex 0ex;}
.li-enumerate{margin:1ex 0ex;}
.dd-description{margin:0ex 0ex 1ex 4ex;}
.dt-description{margin:0ex;}
.toc{list-style:none;}
.thefootnotes{text-align:left;margin:0ex;}
.dt-thefootnotes{margin:0em;}
.dd-thefootnotes{margin:0em 0em 0em 2em;}
.footnoterule{margin:1em auto 1em 0px;width:50%;}
.caption{padding-left:2ex; padding-right:2ex; margin-left:auto; margin-right:auto}
.title{margin:auto;text-align:center}
.center{text-align:center;margin-left:auto;margin-right:auto;}
.flushleft{text-align:left;margin-left:0ex;margin-right:auto;}
.flushright{text-align:right;margin-left:auto;margin-right:0ex;}
DIV TABLE{margin-left:inherit;margin-right:inherit;}
PRE{text-align:left;margin-left:0ex;margin-right:auto;}
BLOCKQUOTE{margin-left:4ex;margin-right:4ex;text-align:left;}
TD P{margin:0px;}
.boxed{border:1px solid black}
.textboxed{border:1px solid black}
.vbar{border:none;width:2px;background-color:black;}
.hbar{border:none;height:2px;width:100%;background-color:black;}
.hfill{border:none;height:1px;width:200%;background-color:black;}
.vdisplay{border-collapse:separate;border-spacing:2px;width:auto; empty-cells:show; border:2px solid red;}
.vdcell{white-space:nowrap;padding:0px;width:auto; border:2px solid green;}
.display{border-collapse:separate;border-spacing:2px;width:auto; border:none;}
.dcell{white-space:nowrap;padding:0px;width:auto; border:none;}
.dcenter{margin:0ex auto;}
.vdcenter{border:solid #FF8000 2px; margin:0ex auto;}
.minipage{text-align:left; margin-left:0em; margin-right:auto;}
.marginpar{border:solid thin black; width:20%; text-align:left;}
.marginparleft{float:left; margin-left:0ex; margin-right:1ex;}
.marginparright{float:right; margin-left:1ex; margin-right:0ex;}
.theorem{text-align:left;margin:1ex auto 1ex 0ex;}
.part{margin:auto;text-align:center}
</STYLE>
</HEAD>
<BODY >
<!--HEVEA command line is: hevea -entities -fix Includes/macros.hva -o UserManual.htm UserManual.tex -->
<!--CUT DEF chapter 1 --><TABLE CLASS="title"><TR><TD><H1 CLASS="titlemain"><FONT COLOR=purple>OcamlP3l</FONT> <TT>2.0</TT>: User Manual</H1><H3 CLASS="titlerest">Roberto Di Cosmo, Zheng Li<SUP><A NAME="text1" HREF="#note1">1</A></SUP>
&#XA0;&#XA0;Marco Danelutto, Susanna Pelagatti<SUP><A NAME="text2" HREF="#note2">2</A></SUP>
&#XA0;&#XA0;Xavier Leroy, Pierre Weis<SUP><A NAME="text3" HREF="#note3">3</A></SUP></H3></TD></TR>
</TABLE><!--TOC chapter Contents-->
<H1 CLASS="chapter"><!--SEC ANCHOR -->Contents</H1><!--SEC END --><UL CLASS="toc"><LI CLASS="li-toc">
<A HREF="#htoc1">Chapter&#XA0;1&#XA0;&#XA0;Skeleton based programming and <FONT COLOR=purple>OcamlP3l</FONT></A>
<UL CLASS="toc"><LI CLASS="li-toc">
<A HREF="#htoc2">1.1&#XA0;&#XA0;The system design goals</A>
</LI><LI CLASS="li-toc"><A HREF="#htoc3">1.2&#XA0;&#XA0;The skeleton model of <FONT COLOR=purple>OcamlP3l</FONT> <TT>2.0</TT></A>
<UL CLASS="toc"><LI CLASS="li-toc">
<A HREF="#htoc4">1.2.1&#XA0;&#XA0;Parallel execution model</A>
</LI><LI CLASS="li-toc"><A HREF="#htoc5">1.2.2&#XA0;&#XA0;Discussion: a comparison with <FONT COLOR=purple>p3l</FONT></A>
</LI><LI CLASS="li-toc"><A HREF="#htoc6">1.2.3&#XA0;&#XA0;A simple example: farming square computation</A>
</LI></UL>
</LI><LI CLASS="li-toc"><A HREF="#htoc7">1.3&#XA0;&#XA0;Skeleton syntax, semantics, and types</A>
<UL CLASS="toc"><LI CLASS="li-toc">
<A HREF="#htoc8">1.3.1&#XA0;&#XA0;On the type of skeleton combinators</A>
</LI><LI CLASS="li-toc"><A HREF="#htoc9">1.3.2&#XA0;&#XA0;The seq skeleton</A>
</LI><LI CLASS="li-toc"><A HREF="#htoc10">1.3.3&#XA0;&#XA0;The farm skeleton</A>
</LI><LI CLASS="li-toc"><A HREF="#htoc11">1.3.4&#XA0;&#XA0;The pipeline skeleton</A>
</LI><LI CLASS="li-toc"><A HREF="#htoc12">1.3.5&#XA0;&#XA0;The loop skeleton</A>
</LI><LI CLASS="li-toc"><A HREF="#htoc13">1.3.6&#XA0;&#XA0;The map skeleton</A>
</LI><LI CLASS="li-toc"><A HREF="#htoc14">1.3.7&#XA0;&#XA0;The reduce skeleton</A>
</LI><LI CLASS="li-toc"><A HREF="#htoc15">1.3.8&#XA0;&#XA0;The parfun skeleton</A>
</LI><LI CLASS="li-toc"><A HREF="#htoc16">1.3.9&#XA0;&#XA0;The pardo skeleton: a parallel scope delimiter</A>
</LI></UL>
</LI><LI CLASS="li-toc"><A HREF="#htoc17">1.4&#XA0;&#XA0;Load balancing: the colors</A>
</LI></UL>
</LI><LI CLASS="li-toc"><A HREF="#htoc18">Chapter&#XA0;2&#XA0;&#XA0;Running your <FONT COLOR=purple>OcamlP3l</FONT> program</A>
<UL CLASS="toc"><LI CLASS="li-toc">
<A HREF="#htoc19">2.1&#XA0;&#XA0;The Mandelbrot example program</A>
</LI><LI CLASS="li-toc"><A HREF="#htoc20">2.2&#XA0;&#XA0;Sequential execution</A>
</LI><LI CLASS="li-toc"><A HREF="#htoc21">2.3&#XA0;&#XA0;Graphical execution</A>
</LI><LI CLASS="li-toc"><A HREF="#htoc22">2.4&#XA0;&#XA0;Parallel execution</A>
<UL CLASS="toc"><LI CLASS="li-toc">
<A HREF="#htoc23">2.4.1&#XA0;&#XA0;Compilation for parallel execution</A>
</LI></UL>
</LI><LI CLASS="li-toc"><A HREF="#htoc24">2.5&#XA0;&#XA0;Common options</A>
<UL CLASS="toc"><LI CLASS="li-toc">
<A HREF="#htoc25">2.5.1&#XA0;&#XA0;Parallel computation overview</A>
</LI></UL>
</LI><LI CLASS="li-toc"><A HREF="#htoc26">2.6&#XA0;&#XA0;Launching the parallel computation</A>
</LI><LI CLASS="li-toc"><A HREF="#htoc27">2.7&#XA0;&#XA0;Common errors</A>
</LI></UL>
</LI><LI CLASS="li-toc"><A HREF="#htoc28">Chapter&#XA0;3&#XA0;&#XA0;More programming examples</A>
<UL CLASS="toc"><LI CLASS="li-toc">
<A HREF="#htoc29">3.1&#XA0;&#XA0;Generating and consuming streams</A>
<UL CLASS="toc"><LI CLASS="li-toc">
<A HREF="#htoc30">3.1.1&#XA0;&#XA0;Generating streams from lists</A>
</LI><LI CLASS="li-toc"><A HREF="#htoc31">3.1.2&#XA0;&#XA0;Generating streams from files</A>
</LI><LI CLASS="li-toc"><A HREF="#htoc32">3.1.3&#XA0;&#XA0;Generating streams repeatedly calling a function</A>
</LI><LI CLASS="li-toc"><A HREF="#htoc33">3.1.4&#XA0;&#XA0;Transforming streams into lists</A>
</LI></UL>
</LI><LI CLASS="li-toc"><A HREF="#htoc34">3.2&#XA0;&#XA0;Global and local definitions</A>
</LI><LI CLASS="li-toc"><A HREF="#htoc35">3.3&#XA0;&#XA0;Managing command line: <TT>option</TT></A>
</LI><LI CLASS="li-toc"><A HREF="#htoc36">3.4&#XA0;&#XA0;Directing allocation: colors</A>
</LI><LI CLASS="li-toc"><A HREF="#htoc37">3.5&#XA0;&#XA0;Mixing Unix processes with <FONT COLOR=purple>OcamlP3l</FONT></A>
</LI></UL>
</LI><LI CLASS="li-toc"><A HREF="#htoc38">Chapter&#XA0;4&#XA0;&#XA0;Implementing <FONT COLOR=purple>OcamlP3l</FONT></A>
<UL CLASS="toc"><LI CLASS="li-toc">
<A HREF="#htoc39">4.1&#XA0;&#XA0;Closure passing as distributed higher order
parameterization</A>
</LI><LI CLASS="li-toc"><A HREF="#htoc40">4.2&#XA0;&#XA0;Communication and process support</A>
</LI><LI CLASS="li-toc"><A HREF="#htoc41">4.3&#XA0;&#XA0;Template implementation</A>
</LI></UL>
</LI><LI CLASS="li-toc"><A HREF="#htoc42">Chapter&#XA0;5&#XA0;&#XA0;Multivariant semantics and logical debugging</A>
</LI><LI CLASS="li-toc"><A HREF="#htoc43">Chapter&#XA0;6&#XA0;&#XA0;Related work, conclusions and perspectives</A>
<UL CLASS="toc"><LI CLASS="li-toc">
<A HREF="#htoc44">6.1&#XA0;&#XA0;Related work</A>
</LI><LI CLASS="li-toc"><A HREF="#htoc45">6.2&#XA0;&#XA0;Conclusions and perspectives</A>
</LI></UL>
</LI></UL><BLOCKQUOTE CLASS="quote"><B>Abstract: </B>
Writing parallel programs is not easy, and debugging them is usually
a nightmare. To cope with these difficulties, a structured approach
to parallel programs using skeletons and template based compiler
techniques has been developed over the past years by several
researchers, including the <FONT COLOR=purple>p3l</FONT> group in Pisa.<P>This approach is based on the use of a set of predefined patterns for
parallel computation which are
really just functionals implemented via templates exploiting the
underlying parallelism, so it is natural to ask whether marrying a
real functional language like <FONT COLOR=purple>Ocaml</FONT> with the <FONT COLOR=purple>p3l</FONT> skeletons can be
the basis of a powerful parallel programming environment.</P><P>The <FONT COLOR=purple>OcamlP3l</FONT> prototype described in this document shows that
this is the case. The prototype, written entirely
in <FONT COLOR=purple>Ocaml</FONT> using a limited form of closure passing, allows a very
simple and clean programming style, shows real speed-up over a
network of workstations and as an added fundamental bonus allows
logical debugging of a user parallel program in a sequential
framework without changing the user code.
</P></BLOCKQUOTE><!--TOC chapter Skeleton based programming and <FONT COLOR=purple>OcamlP3l</FONT>-->
<H1 CLASS="chapter"><!--SEC ANCHOR --><A NAME="htoc1">Chapter&#XA0;1</A>&#XA0;&#XA0;Skeleton based programming and <FONT COLOR=purple>OcamlP3l</FONT></H1><!--SEC END --><P>
In a skeleton based parallel programming model 
[<A HREF="#cole-th"><CITE>6</CITE></A><CITE>, </CITE><A HREF="#ic-parle-93-1"><CITE>11</CITE></A><CITE>, </CITE><A HREF="#fgcs-firenze"><CITE>9</CITE></A>] a set of <I>skeletons</I>,
i.e. of second order functionals modeling common parallelism
exploitation patterns are provided to the user/programmer. The
programmer uses skeletons to give parallel structure to
an application and uses a plain sequential language to
express the sequential portions of the parallel application.
He/she has no other way to express
parallel activities but skeletons: no explicit process creation,
scheduling, termination, no communication primitives, no shared
memory, no notion of executing a program onto a parallel
architecture at all.</P><P><FONT COLOR=purple>OcamlP3l</FONT> is a programming environment that allows writing parallel
programs in <FONT COLOR=purple>Ocaml</FONT><SUP><A NAME="text4" HREF="#note4">1</A></SUP> according to a skeleton model
derived by the one of <FONT COLOR=purple>p3l</FONT><SUP><A NAME="text5" HREF="#note5">2</A></SUP>, provides
seamless integration of parallel programming and functional
programming and advanced features like sequential logical debugging
(i.e. functional debugging of a parallel program via execution of the
parallel code onto a
sequential machine) of parallel programs and strong typing, useful
both in teaching parallel programming and in building of
full-scale applications<SUP><A NAME="text6" HREF="#note6">3</A></SUP>.</P><P>In this chapter, we will first discuss the goals of our system
design, then recall the basic notions of the skeleton
model for structured parallel programming and describe the
skeleton model provided by <FONT COLOR=purple>OcamlP3l</FONT>, providing an informal
sequential (functional) and parallel semantics.
It will
be then time to describe how an <FONT COLOR=purple>OcamlP3l</FONT> program can be compiled
and run on your system (Chapter&#XA0;<A HREF="#cap:run">2</A>). Then, we discuss more
<FONT COLOR=purple>OcamlP3l</FONT> examples (Chapter&#XA0;<A HREF="#cap:exe">3</A>) and detail
<FONT COLOR=purple>OcamlP3l</FONT> implementation (Chapter&#XA0;<A HREF="#cap:implementation">4</A>)
describing how we achieved our goals using to our
advantage the flexibility of the <FONT COLOR=purple>Ocaml</FONT> system.</P><!--TOC section The system design goals-->
<H2 CLASS="section"><!--SEC ANCHOR --><A NAME="htoc2">1.1</A>&#XA0;&#XA0;The system design goals</H2><!--SEC END --><P>We started the development of <FONT COLOR=purple>OcamlP3l</FONT> in 1998.
At that time, the main goal of the project was to test the possibility to integrate
parallel programming in a functional language using the skeleton
model: after all, as we will see later, skeletons are just
functions, so a functional language should provide the natural setting
for them. We also wanted to preserve the elegance and flexibility of
the functional model, and the strong type system
that comes with <FONT COLOR=purple>Ocaml</FONT>. These goals were achieved in the first version of
<FONT COLOR=purple>OcamlP3l</FONT>.</P><P>But during the implementation of the system, it turned out that we
could get more than that: in our implementation, the sequential
semantics that is traditionally used to describe the functional
behaviour of the skeletons could actually be used to provide an
elementary library allowing the execution of the user code in a sequential
mode, without modifying the user code. This is a
major advantage of the approach: in our system, the user can easily
debug the logic of his program running it with the sequential
semantics on a sequential machine using all the traditional techniques
(including tracing and step by step execution which are of no
practical use on parallel systems), and when the program is logically
correct he/she is guaranteed (assuming the runtime we provide is correct)
to obtain a correct parallel execution. Although a similar approach
has been taken in other skeleton based programming models, by
using the <FONT COLOR=purple>Ocaml</FONT> programming environment this result happens to be
particularly easy to achieve. This is definitely not the case of
programs written using a sequential language and directly calling
communication libraries/primitives such as the Unix socket interface
or the MPI or PVM libraries, as the logic of the program is
inextricably intermingled with low level information on
data exchange and process handling.<BR>

Following this same idea (no changes to the user code, only different
semantics for the very same skeletons), we also provided a &#X201C;graphical
semantics&#X201D; that produces a picture of the process network used during
the parallel execution of the user program.<BR>

Finally, we wanted a simple way to generate (from the user source code)
the various executables to be run on the different nodes of a parallel
machine: here the high level of abstraction provided by functional
programming, coupled with the ability to send closures over a channel
among copies of the same program provided the key to an elementary and
robust runtime system that consists of a
very limited number of lines of code.<BR>

But let's first of all introduce the skeleton model of <FONT COLOR=purple>OcamlP3l</FONT>
<TT>2.0</TT>.</P><!--TOC section The skeleton model of <FONT COLOR=purple>OcamlP3l</FONT> <TT>2.0</TT>-->
<H2 CLASS="section"><!--SEC ANCHOR --><A NAME="htoc3">1.2</A>&#XA0;&#XA0;The skeleton model of <FONT COLOR=purple>OcamlP3l</FONT> <TT>2.0</TT></H2><!--SEC END --><P>
A skeleton parallel programming model supports so-called `structured parallel
programming' [<A HREF="#cole-th"><CITE>6</CITE></A><CITE>, </CITE><A HREF="#ic-parle-93-1"><CITE>11</CITE></A><CITE>, </CITE><A HREF="#fgcs-firenze"><CITE>9</CITE></A>]. Using such a
model, the parallel structure/behaviour of any application has to be
expressed by using <EM>skeletons</EM> picked up out of a collection of
predefined ones, possibly in a nested way. Each skeleton models
a typical <I>pattern</I> of parallel computation (or
<I>form</I> of parallelism) and it is parametric in the
computation performed in parallel. As an example, pipeline and farm
have been often included in skeleton collections. A
<I>pipeline</I><A NAME="@default0"></A> just models the execution of a number of
computations (stages) in cascade over a stream of input data items. Therefore,
the pipeline skeleton models all those computations where a function
<I>f</I><SUB><I>n</I></SUB>(<I>f</I><SUB><I>n</I>&#X2212;1</SUB>(&#X2026; (<I>f</I><SUB>2</SUB>(<I>f</I><SUB>1</SUB>(<I>x</I>))) &#X2026;)) has to be computed (the
<I>f</I><SUB><I>i</I></SUB> being the functions computed in cascade). A <I>farm</I><A NAME="@default1"></A> models
the execution of a given function in parallel over a stream of input data
items. Therefore, farms model all those
computations where a function <I>f</I>(<I>x</I>) has to be computed independently 
over <I>n</I> input data items in parallel.</P><P>In a skeleton model, a programmer must select the proper skeletons to program
his/her application leaving all the implementation/optimization to the
compiler/support.
This means, for instance, that the programmer has no responsibility
in deriving code for creating parallel processes, mapping and
scheduling processes on target hardware, establishing communication
frameworks (channels, shared memory locations, etc) or performing
actual interprocess communications. All these activities, needed in
order to implement the skeleton application code onto the target
hardware, are entirely handled by the compile/run time support of
the skeleton programming environment. In some cases, the support
also computes some parameters such as the parallelism
degree or the communication grain needed to optimize the execution of
the skeleton program onto the target hardware
[<A HREF="#tesi-susanna"><CITE>19</CITE></A><CITE>, </CITE><A HREF="#orlando-grosso"><CITE>2</CITE></A><CITE>, </CITE><A HREF="#libro-susi"><CITE>20</CITE></A>]. </P><P>In the years, the skeleton model supplied by <FONT COLOR=purple>OcamlP3l</FONT> has evolved. Current
<FONT COLOR=purple>OcamlP3l</FONT> version (<TT>2.0</TT>) supplies 
three kinds of skeletons:
</P><UL CLASS="itemize"><LI CLASS="li-itemize">
<I>task parallel</I> skeletons<A NAME="@default2"></A>, modeling parallelism
exploited between <I>independent</I> processing activities
relative to different input data. In this set we have: pipe
(cf.&#XA0;<A HREF="#sec:pipe">1.3.4</A>) and farm (cf.&#XA0;<A HREF="#sec:farm">1.3.3</A>), whose semantics has
already 
been informally described above. Such
skeletons correspond to the usual task parallel skeletons
appearing both in <FONT COLOR=purple>p3l</FONT> and in other skeleton models
[<A HREF="#cole-th"><CITE>6</CITE></A><CITE>, </CITE><A HREF="#ic-parle-93-1"><CITE>11</CITE></A><CITE>, </CITE><A HREF="#darli-to-1"><CITE>13</CITE></A>].
</LI><LI CLASS="li-itemize"><I>data parallel</I> skeletons<A NAME="@default3"></A>, modeling parallelism exploited
computing different parts of the same input
data. In this set, we provide <CODE>mapvector</CODE> (cf.&#XA0;<A HREF="#sec:map">1.3.6</A>) and
<CODE>reducevector</CODE> (cf.&#XA0;<A HREF="#sec:red">1.3.7</A>). 
Such skeletons are not as powerful as the
<CODE>map</CODE> and <CODE>reduce</CODE> skeletons of <FONT COLOR=purple>p3l</FONT>. Instead, they closely resemble
the map&#XA0;(*) and reduce&#XA0;(/) functionals of the Bird-Meertens formalism
discussed in [<A HREF="#bird1"><CITE>3</CITE></A>] and the <FONT COLOR=purple>map</FONT> and <FONT COLOR=purple>fold</FONT> skeletons in SCL
[<A HREF="#darli-to-1"><CITE>13</CITE></A>]. The <TT>mapvector</TT> skeleton models the parallel application of
a generic function <I>f</I> to all the items of a vector data structure,
whereas the <CODE>reducevector</CODE> skeleton models a parallel computation folding all
the elements of a vector with a commutative and associative binary
operator &#X2295;. 
</LI><LI CLASS="li-itemize"><I>service</I> or <I>control</I> skeletons<A NAME="@default4"></A><A NAME="@default5"></A>, which are not
parallel <I>per se</I>. Service skeletons are used to
encapsulate <FONT COLOR=purple>Ocaml</FONT> non-parallel code to be used within other
skeletons (seq skeleton (cf.&#XA0;<A HREF="#sec:seq">1.3.2</A>)), to iterate the execution
of skeletons (loop skeleton (cf.&#XA0;<A HREF="#sec:loop">1.3.5</A>)), to transform a process
network defined using skeletons in a valid <FONT COLOR=purple>Ocaml</FONT> function (parfun skeleton
(cf.&#XA0;<A HREF="#sec:parfun">1.3.8</A>)) and to define global application structure (pardo
skeleton (cf.&#XA0;<A HREF="#sec:pardo">1.3.9</A>)). 
</LI></UL><P>As an example, consider an application whose initial and final phase cannot be
parallelized, while the behavior in the central part
is clearly divided in two consecutive
phases (stages) working on a stream of data. 
This can be modeled by the combination of <FONT COLOR=purple>OcamlP3l</FONT> skeletons in
Fig.&#XA0;<A HREF="#fig:struct1">1.1</A>.
</P><BLOCKQUOTE CLASS="figure"><DIV CLASS="center"><HR WIDTH="80%" SIZE=2></DIV>
<DIV CLASS="center">
<IMG SRC="UserManual001.gif">
</DIV>
<DIV CLASS="caption"><TABLE CELLSPACING=6 CELLPADDING=0><TR><TD VALIGN=top ALIGN=left>Figure 1.1: Structure of an example <FONT COLOR=purple>OcamlP3l</FONT> application: (a) the skeleton
nesting, (b) processes participating to the implementation: <TT>pardo</TT>,
<TT>stage1</TT> and <TT>stage2</TT>.<A NAME="fig:struct1"></A></TD></TR>
</TABLE></DIV>
<DIV CLASS="center"><HR WIDTH="80%" SIZE=2></DIV></BLOCKQUOTE><P>All the structure is encapsulated in a <TT>pardo</TT>
skeleton. <TT>initsec</TT> and <TT>finsec</TT> are two sequential <FONT COLOR=purple>Ocaml</FONT>
functions describing the initial and final parts of application. The central
part describes a parallel computation structured as a pipeline built
out of two stages. If both stages are implemented via a sequential function
(<TT>seq</TT>) data will flow as shown in Fig.&#XA0;<A HREF="#fig:struct1">1.1</A>.(<I>a</I>). In
particular, the 
implementation spawns three processes: a `pardo' process (executing the
sequential parts) and a network of two
processes 
implementing the pipeline (Fig.&#XA0;<A HREF="#fig:struct1">1.1</A>.(<I>b</I>)).</P><BLOCKQUOTE CLASS="figure"><DIV CLASS="center"><HR WIDTH="80%" SIZE=2></DIV>
<DIV CLASS="center">
<IMG SRC="UserManual002.gif">
</DIV>
<DIV CLASS="caption"><TABLE CELLSPACING=6 CELLPADDING=0><TR><TD VALIGN=top ALIGN=left>Figure 1.2: Further parallelizing the example <FONT COLOR=purple>OcamlP3l</FONT> application: (a) the
skeleton 
nesting, (b) the processes participating to the implementation.<A NAME="fig:struct2"></A></TD></TR>
</TABLE></DIV>
<DIV CLASS="center"><HR WIDTH="80%" SIZE=2></DIV></BLOCKQUOTE><P>
Now, suppose the programmer recognizes that
the <TT>initsec</TT> is computationally intensive and can be decomposed in a
sequential part (<TT>initsec</TT>) and a parallel part which applies a
function <TT>work</TT>
independently on 
each element on a stream of input data.
In this case, we can use a farm skeleton to have a pool of replicas of
<TT>work</TT>. Moreover, suppose the function <TT>stage1</TT> boils down
to applying a function <TT>f</TT> to all the elements of a vector and function
<TT>stage2</TT> &#X201C;sums&#X201D; up all the elements of the resulting vector by using an
associative and commutative operator <TT>g</TT>.
In this case, the programmer can refine the skeleton structure by using the
combination in Fig.&#XA0;<A HREF="#fig:struct2">1.2</A>.
Here, the four replicas of <TT>work</TT> act on different independent
elements in the input stream, producing four streams of results (vectors) which
are merged 
before entering the pipeline. Each stage is in turn implemented in parallel
using four processes. In the first stage, each input vector is partitioned
into four blocks. Each process takes care of applying <TT>f</TT> on one of
the four blocks. Then in the second stage, each process sums up the elements
in a block of the
vector and then all the partial results are added before providing the final
result back to the pardo (and to the <CODE>finsec</CODE> function).</P><P>The first &#X201C;application outline&#X201D; (Fig.&#XA0;<A HREF="#fig:struct1">1.1</A>)
corresponds to the following (incomplete) <FONT COLOR=purple>OcamlP3l</FONT> 
code:
</P><BLOCKQUOTE CLASS="quote">
<PRE CLASS="verbatim">let initsec _ = ...;;        (* generates stream *)
let finsec x = ...;;         (* consumes stream *)
let stage1 _ x = ... ;;      
let stage2 _ x = ... ;;
(* defines pipe network *)
let pipe =  parfun (fun () -&gt; seq(stage1) ||| seq(stage2)) ;;
pardo (fun () -&gt;
  let y = pipe (initsec ()) in
    finsec y 
);;
</PRE></BLOCKQUOTE><P>
notice the use of <TT>seq</TT> skeleton to encapsulate ordinary <FONT COLOR=purple>Ocaml</FONT>
functions and the use of <TT>parfun</TT> to define the pipe network.
Here is the sketch of <FONT COLOR=purple>OcamlP3l</FONT> code for the second application outline
(Fig.&#XA0;<A HREF="#fig:struct2">1.2</A>): 
</P><BLOCKQUOTE CLASS="quote">
<PRE CLASS="verbatim">let degree = ref 4;               (* parallel degree *)
let work _ x = ..;;               (* to be farmed out *)
let f _ x = ...;;                 (* to be mapped *)
let g _ (x,y) = ...;;             (* to be reduced *)
let pstage1 = mapvector(seq(f),!degree);;
let pstage2 = reducevector(seq(g),!degree);;
let pipe = parfun (fun () -&gt; pstage1 ||| pstage2);;
let afarm = parfun (fun () -&gt; farm (seq(farm_worker),!degree));;
pardo
  (fun () -&gt;
     let y = pipe (afarm (initsec ())) in
       finsec y
  );;
</PRE></BLOCKQUOTE><P>
here <CODE>!degree</CODE> refers to the number of
parallel processes to be used in the skeleton implementation of mapvector,
reducevector and farm. This value can vary in each execution of the
application without recompiling (eg., using a configuration file).
Details on how to write and run proper <FONT COLOR=purple>OcamlP3l</FONT> programs are
given later in Chapter&#XA0;<A HREF="#cap:run">2</A>. 
In the current release, the user is supposed to explicitly give the number of
processors to be used in each farm, mapvector and reduce skeleton. In other
words the choice of the parallelism degree actually exploited in such
skeletons is up to the
programmer. It is foreseeable in a future release to ask the system to
guess optimal values depending on available resources (following the
approach of <FONT COLOR=purple>p3l</FONT> [<A HREF="#orlando-grosso"><CITE>2</CITE></A><CITE>, </CITE><A HREF="#tesi-susanna"><CITE>19</CITE></A>]), as it is
discussed in more detail below.</P><P>Applications with a parallel structure given by skeletons (such as the
ones outlined above) can be implemented by using <I>implementation
templates</I> [<A HREF="#cole-th"><CITE>6</CITE></A><CITE>, </CITE><A HREF="#tesi-susanna"><CITE>19</CITE></A>]. 
An implementation template is a known, parametric way of
exploiting the kind of parallelism modeled by a skeleton onto a
particular target architecture. As an example, a template
corresponding to the mapvector skeleton will take some input vector data, it
will split the data into chunks holding one or more data items of the
vector, schedule them to a set of &#X201C;worker&#X201D; processes computing the
map function <I>f</I> and finally collect the results and rebuild the
output vector data structure. All these operations will be
performed by some processes, using either communications or shared
memory locations for data communication. Such a template must, as its
primary goal, implement in an efficient way the mapvector skeleton and
therefore:
</P><UL CLASS="itemize"><LI CLASS="li-itemize">
it must implement any kind of mapvector function <I>f</I>, and therefore
must be parametric with respect to the input and output data types
</LI><LI CLASS="li-itemize">it must support any reasonable parallelism degree, therefore
it must work (and provide effective parallelism exploitation) when
executed on an arbitrary number of processors. 
</LI></UL><P>In <FONT COLOR=purple>OcamlP3l</FONT> <TT>2.0</TT>, the parallelism degree of
each skeleton is chosen by the programmer. In following
releases, we will explore the possibility of using <I>analytic
performance models</I> 
associated with the implementation template 
process networks to derive the parallelism degree automatically&#XA0;[<A HREF="#skbook02"><CITE>21</CITE></A>]. An analytic
performance model is a set of functions computing different measures of
the performance achieved by a template on the basis of a small set of
machine dependent and user code parameters.
Examples of machine dependent parameters are the cost of communication
startup and the per-byte transmission cost. Examples of user code
parameters are the mean and variance of execution time for
user-defined sequential parts of the program and the size of data
flowing between skeletons.
The models describe the template behavior as a function of the resources used
(e.g. the physical number of executors in a farm) and can be
used by the skeleton support to predict such behavior and to tune resource allocation. A more detailed description of the whole automatic
optimization process executed by a compiler using 
performance models for skeleton tuning is given in [<A HREF="#tesi-susanna"><CITE>19</CITE></A><CITE>, </CITE><A HREF="#libro-susi"><CITE>20</CITE></A><CITE>, </CITE><A HREF="#skbook02"><CITE>21</CITE></A>].</P><P><A NAME="f0"></A>
</P><!--TOC subsection Parallel execution model-->
<H3 CLASS="subsection"><!--SEC ANCHOR --><A NAME="htoc4">1.2.1</A>&#XA0;&#XA0;Parallel execution model</H3><!--SEC END --><P>
A parallel computation in <FONT COLOR=purple>OcamlP3l</FONT> is defined by three components:
</P><UL CLASS="itemize"><LI CLASS="li-itemize">
a set of plain sequential
<FONT COLOR=purple>Ocaml</FONT>
functions (CF, common functions in Figure&#XA0;<A HREF="#fig:pardo">1.3</A>), 
</LI><LI CLASS="li-itemize">some
clusters of parallel processes, each one defined by a suitable composition of skeleton combinators
enclosed in a <CODE>parfun</CODE> (SF, skeleton functions in Figure&#XA0;<A HREF="#fig:pardo">1.3</A>)
and 
</LI><LI CLASS="li-itemize">a
<CODE>pardo</CODE> application.
</LI></UL><P>Each time a <CODE>parfun(h)</CODE> function definition is
evaluated, a corresponding
network of processes is created according to the skeleton composition in
<CODE>h</CODE>. Each network transforms a stream of independent input data &#X2026;
<I>x</I><SUB>1</SUB>, <I>x</I><SUB>0</SUB> in a 
stream of output data &#X2026; <I>h</I>(<I>x</I><SUB>1</SUB>), <I>h</I>(<I>x</I><SUB>0</SUB>) according to
<CODE>h</CODE>.</P><P>When a <CODE>pardo</CODE> is evaluated, applications of common functions boil down to
normal sequential evaluation, while applications of skeletal functions
feed arguments data to the corresponding skeletal process network and are
evaluated in parallel. </P><P>In practice, each <CODE>pardo</CODE> defines a network built out of all
the processes in skeletal networks (<CODE>parfun</CODE> defined functions) plus a
<EM>root</EM> process orchestrating 
all the computation. Both the root node and the generic nodes run in SPMD
mode. Initially, the root specializes all the generic nodes, sending
information on the actual process to be executed (eg., a farm dispatcher, a
farm worker, a mapvector worker etc). </P><P>Then, the root process starts executing the <CODE>pardo</CODE>. If code is sequential, it
is executed locally on the root node. Otherwise, 
if the evaluation of a <CODE>parfun</CODE> function is encountered,
the root activates evaluation passing the relevant parameters to the
corresponding network. The same network can be activated many times, 
each time an evaluation of the corresponding <CODE>parfun</CODE> function is
encountered.<BR>

Notice that the execution model assumes an unlimited number of homogeneous processors.
In practical situations, there will be fewer processors than processes, and they will have
heterogeneous capacity. The <FONT COLOR=purple>OcamlP3l</FONT> support, possibly with some help from the programmer
(using colors, see Sec.&#XA0;<A HREF="#ss:colors">1.4</A>),
is in charge of implementing this in a transparent way.</P><BLOCKQUOTE CLASS="figure"><DIV CLASS="center"><DIV CLASS="center"><HR WIDTH="80%" SIZE=2></DIV>
<IMG SRC="UserManual003.gif">
<DIV CLASS="caption"><TABLE CELLSPACING=6 CELLPADDING=0><TR><TD VALIGN=top ALIGN=left>Figure 1.3: Parallel execution model: the role of parfun and pardo</TD></TR>
</TABLE></DIV><A NAME="fig:pardo"></A>
<DIV CLASS="center"><HR WIDTH="80%" SIZE=2></DIV></DIV></BLOCKQUOTE><!--TOC subsection Discussion: a comparison with <FONT COLOR=purple>p3l</FONT>-->
<H3 CLASS="subsection"><!--SEC ANCHOR --><A NAME="htoc5">1.2.2</A>&#XA0;&#XA0;Discussion: a comparison with <FONT COLOR=purple>p3l</FONT></H3><!--SEC END --><P> 
Even if <FONT COLOR=purple>OcamlP3l</FONT> skeletons are close to
original <FONT COLOR=purple>p3l</FONT> ones, 
the parallel evaluation model is completely different. Thus, for those
familiar with <FONT COLOR=purple>p3l</FONT>, it is 
interesting to highlight the main differences between the two models and to give a brief account of the reasons that have led to
such a design change.</P><P>In the original <FONT COLOR=purple>p3l</FONT> system (and, actually, in initial versions of <FONT COLOR=purple>OcamlP3l</FONT>&#XA0;[<A HREF="#Ocamlp3lMlw98"><CITE>10</CITE></A>]), a program is clearly stratified into two
levels: there is a skeleton <EM>cap</EM>, that can be composed of an
arbitrary number of skeleton combinators, but as soon as one goes
outside this cap, passing into the sequential code through the
<TT>seq</TT> combinator, there is no way for the sequential code to
call a skeleton. To say it briefly, the entry point of a <FONT COLOR=purple>p3l</FONT>
program <EM>must</EM> be a skeleton expression, and no skeleton
expression is allowed anywhere else in the sequential code. Using current <FONT COLOR=purple>OcamlP3l</FONT>
terminology, <FONT COLOR=purple>p3l</FONT> restricts the pardo to contain one single call to a network
defined with <TT>parfun</TT>, and <EM>without</EM> calls to sequential functions.<BR>

This restriction is quite reasonable when
the goal is to build <EM>a single</EM> stream processing network
described by the skeleton cap. However, it has several drawbacks in the
general case:</P><UL CLASS="itemize"><LI CLASS="li-itemize">
it breaks uniformity, since even if the skeletons <EM>look
like</EM> ordinary functionals, they <EM>cannot</EM> be used as
ordinary functions, in particular inside sequential code,
</LI><LI CLASS="li-itemize">many applications (such as the numerical algorithms described in
[<A HREF="#clement04"><CITE>5</CITE></A>]) boil down to simple nested loops, some of
which can be easily parallelised, and some cannot; forcing the
programmer to push all the parallelism in the skeleton cap could lead
to rewriting the algorithm in a very unnatural way,
</LI><LI CLASS="li-itemize">indeed, a `parallelizable' operation
can be used at several stages in the algorithm: the <FONT COLOR=purple>p3l</FONT> skeleton
cap does not allow the user to specify that parts of the stream
processing network can be shared among different phases of the
computation, which is an essential requirement to avoid wasting
computational resources.
</LI></UL><P>To overcome all these difficulties and limitations, the
<TT>2.0</TT> version of <FONT COLOR=purple>OcamlP3l</FONT> introduces the new
<TT>parfun</TT> skeleton (not present in <FONT COLOR=purple>p3l</FONT>), the very <EM>dual</EM> of the <TT>seq</TT>
skeleton. In simple words, one can wrap a full skeleton expression
inside a <TT>parfun</TT>, and obtain a regular <FONT COLOR=purple>Ocaml</FONT> stream processing
function, usable with no limitations in any sequential piece of code:
a <TT>parfun</TT> encapsulated skeleton behaves exactly as a normal
function that receives a stream as input, and returns a stream as
output. However, in the parallel semantics, the <TT>parfun</TT>
combinator gets a parallel interpretation, so that the encapsulated
function is actually implemented as a parallel network (the network to
which the <TT>parfun</TT> combinator provides an interface).<BR>
Since many <TT>parfun</TT> expressions may occur in an <FONT COLOR=purple>OcamlP3l</FONT>
program, there may be several disjoint parallel processing networks at
runtime. This implies that, to contrast with <FONT COLOR=purple>p3l</FONT>, the
<FONT COLOR=purple>OcamlP3l</FONT> model of computation requires a <EM>main</EM> sequential
program (the <CODE>pardo</CODE>): this main program is responsible for information interchange
with the various <TT>parfun</TT> encapsulated networks.</P><!--TOC subsection A simple example: farming square computation-->
<H3 CLASS="subsection"><!--SEC ANCHOR --><A NAME="htoc6">1.2.3</A>&#XA0;&#XA0;A simple example: farming square computation</H3><!--SEC END --><P><A NAME="ss:example"></A>
It is now time to discuss a simple but complete <FONT COLOR=purple>OcamlP3l</FONT> program.
The program in Figure&#XA0;<A HREF="#fig:squarecode">1.4</A> uses a farm to compute a very simple function over a
stream of floats.
</P><BLOCKQUOTE CLASS="figure"><DIV CLASS="center"><HR WIDTH="80%" SIZE=2></DIV>
<PRE CLASS="verbatim">(* computes x square *)
let farm_worker _ = fun x -&gt; x *. x;;

(* prints a result *)
let print_result x = print_float x; print_newline();;

let compute = parfun (fun () -&gt;  (farm (seq(farm_worker),4)));;

pardo(fun () -&gt;
  let is = P3lstream.of_list [1.0;2.0;3.0;4.0;5.0;6.0;7.0;8.0] in
  let s' = compute is in P3lstream.iter print_result s';
);;
</PRE><DIV CLASS="caption"><TABLE CELLSPACING=6 CELLPADDING=0><TR><TD VALIGN=top ALIGN=left>Figure 1.4: <FONT COLOR=purple>OcamlP3l</FONT> code using a farm to square a stream of float.<A NAME="fig:squarecode"></A></TD></TR>
</TABLE></DIV>
<DIV CLASS="center"><HR WIDTH="80%" SIZE=2></DIV></BLOCKQUOTE><P>
First we have two standard <FONT COLOR=purple>Ocaml</FONT> functions: <CODE>farm_worker</CODE> which
simply computes the square of a float argument and <CODE>print_result</CODE> which
dumps results on the standard output. Notice that
<CODE>farm_worker</CODE> takes two parameters instead of one as it would seem
reasonable. The extra parameter (<CODE>_</CODE>) is required by the <CODE>seq</CODE>
skeleton type and is used in general to provide local initialization data (for
instance, an initialization matrix, some initial seed or the 
like)<A NAME="@default6"></A><A NAME="@default7"></A>. In this simple case, initialization data are not needed and the
parameter is just ignored by <CODE>farm_worker</CODE>. This optional initialization
is provided for all <FONT COLOR=purple>OcamlP3l</FONT> skeletons (see Section&#XA0;<A HREF="#sss:unit">1.3.1</A>). 
Function <CODE>compute</CODE> uses <CODE>parfun</CODE>
to define a parallel network built by a single farm, in particular: 
</P><PRE CLASS="verbatim">seq(farm_worker)
</PRE><P>turns the sequential <CODE>farm_worker</CODE> function into a `stream processor'
applying it to a stream of input values. Then, an instance of the farm skeleton is defined with
</P><PRE CLASS="verbatim">farm (seq(farm_worker),4)
</PRE><P>which spawns four workers. Finally,
</P><PRE CLASS="verbatim">parfun (fun () -&gt;  (farm (seq(farm_worker),4)));;
</PRE><P>encapsulates the skeleton network into a standard <FONT COLOR=purple>Ocaml</FONT>
function. </P><P>The last <CODE>pardo</CODE> defines how sequential functions and parallel modules
are interconnected. In this case, we have a single parallel module
(<CODE>compute</CODE>) and two sequential parts. The first sequential part builds up the data stream
(using the standard <FONT COLOR=purple>OcamlP3l</FONT> library function 
</P><PRE CLASS="verbatim">P3lstream.of_list [1.0;2.0;3.0;4.0;5.0;6.0;7.0;8.0]
</PRE><P>which turns lists in streams) and the second part
applies <CODE>print_result</CODE> to all the elements in the stream (using standard
stream iterator
<CODE>P3lstream.iter</CODE>). The global network is shown in
Figure&#XA0;<A HREF="#fig:simplefarm">1.5</A>, where arrows point out the data flow among processes.
</P><BLOCKQUOTE CLASS="figure"><DIV CLASS="center"><DIV CLASS="center"><HR WIDTH="80%" SIZE=2></DIV>
<IMG SRC="UserManual004.gif">
<DIV CLASS="caption"><TABLE CELLSPACING=6 CELLPADDING=0><TR><TD VALIGN=top ALIGN=left>Figure 1.5: Overall process network of the simple farm squaring a stream of double.</TD></TR>
</TABLE></DIV><A NAME="fig:simplefarm"></A>
<DIV CLASS="center"><HR WIDTH="80%" SIZE=2></DIV></DIV></BLOCKQUOTE><!--TOC section Skeleton syntax, semantics, and types-->
<H2 CLASS="section"><!--SEC ANCHOR --><A NAME="htoc7">1.3</A>&#XA0;&#XA0;Skeleton syntax, semantics, and types</H2><!--SEC END --><P> <A NAME="sec:skeletons"></A></P><P>Here we describe the syntax, the informal semantics, and the
types assigned to each skeleton combinator.</P><P>Each skeleton is a stream processor, transforming an input stream into an
output stream and is equipped with three semantics:
</P><DL CLASS="description"><DT CLASS="dt-description">
<B>sequential semantics</B></DT><DD CLASS="dd-description"> a suitable sequential <FONT COLOR=purple>Ocaml</FONT> function
transforming all the elements of the input stream;
</DD><DT CLASS="dt-description"><B>parallel semantics</B></DT><DD CLASS="dd-description"> a process network implementing the stream
transformation in
parallel;
</DD><DT CLASS="dt-description"><B>graphical semantics</B></DT><DD CLASS="dd-description"> a graphical representation of the process network
corresponding to the parallel semantics.
</DD></DL><!--TOC subsection On the type of skeleton combinators-->
<H3 CLASS="subsection"><!--SEC ANCHOR --><A NAME="htoc8">1.3.1</A>&#XA0;&#XA0;On the type of skeleton combinators</H3><!--SEC END --><P>
<A NAME="sss:unit"></A></P><P>First of all, let's explain why the actual <FONT COLOR=purple>Ocaml</FONT> types of our
skeleton combinators are a bit more complex than those used by other skeleton
systems (eg., [<A HREF="#darli-to-1"><CITE>13</CITE></A>]).
In effect, our types seem somewhat polluted by spurious
additional <CODE>unit</CODE> types, compared to the types one would expect.</P><P>For instance, consider the <CODE>seq</CODE> combinator. As informally discussed above, <CODE>seq</CODE>
encapsulates any <FONT COLOR=purple>Ocaml</FONT> function <I>f</I> into a sequential process which
applies <I>f</I> to all the inputs received in the input stream. This means that,
writing
<CODE>seq f</CODE>, any <FONT COLOR=purple>Ocaml</FONT> function with type <CODE>f : 'a -&gt; 'b</CODE>
is wrapped into a sequential process (this is reminiscent of
the <CODE>lift</CODE> combinator used in many stream processing libraries of
functional programming languages).</P><P>Hence, a straightforward type for <CODE>seq</CODE> would be</P><P><CODE>('a -&gt; 'b) -&gt; 'a stream -&gt; 'b stream</CODE>.<BR>

However, in <FONT COLOR=purple>OcamlP3l</FONT>, <CODE>seq</CODE> is declared as
</P><DIV CLASS="center">
<CODE>seq : (unit -&gt; 'a -&gt; 'b) -&gt; unit -&gt; 'a stream -&gt; 'b stream</CODE>
</DIV><P>meaning that the lifted function argument <CODE>f</CODE> gets an
extra <CODE>unit</CODE> argument. In effect, in real-world application, the
user functions may need to hold a sizeable amount of local data (e.g. some
huge matrices that have to be initialised in a numerical
application), and we decided to have a type general enough to allow 
the user to
finely describe where and when those data have to be initialized and/or
copied.</P><P>Reminiscent of partial evaluation and &#X3BB;-lifting, we reuse the
classical techniques of functional programming to initialize or
allocate data globally and/or locally to a function closure. This is
just a bit complicated here, due to the higher-order nature of the
skeleton algebra, that in turn reflects the inherent complexity of
parallel computing:</P><UL CLASS="itemize"><LI CLASS="li-itemize">
<EM>global initialization</EM>: the data is initialised once and
for all, and is then replicated in every copy of the stream processor that
a <CODE>farm</CODE>, a <CODE>mapvector</CODE> or a <CODE>reducevector</CODE> may launch; this was
already available in the previous versions of <FONT COLOR=purple>OcamlP3l</FONT>, since we
could write
<PRE>
let f =
  let localdata = do_huge_initialisation_step () in
  fun x -&gt; compute (localdata, x);;
&#X2026;
farm (seq f, 10)
</PRE></LI><LI CLASS="li-itemize"><EM>local initialization</EM>: the data is initialised by each
stream processor, <EM>after</EM> the copy has been performed by a
<CODE>farm</CODE> or a <CODE>mapvector</CODE> skeleton; this was just impossible
in the previous versions of <FONT COLOR=purple>OcamlP3l</FONT>; with unit types, it is
now easy to achieve: 
<PRE>
let f () =
  let localdata = do_huge_initialisation_step () in
  fun x -&gt; compute (localdata, x);;
&#X2026;
farm (seq f, 10)
</PRE>
when the <CODE>farm</CODE> skeleton creates 10 copies of <CODE>seq f</CODE>,
each copy is created by passing () to the <CODE>seq</CODE> combinator,
which in turn passes () to <I>f</I>, producing the allocation of a
different copy of <I>localdata</I> for each instance<SUP><A NAME="text7" HREF="#note7">4</A></SUP>.<P>Note also that the old behaviour, namely <FONT COLOR=purple>OcamlP3l</FONT> version 1.0, where a
unique initialization was
shared by all copies, is still easy (and can be freely combined to
further local initializations if needed):
</P><PRE>
let f =
  let localdata = do_huge_initialisation_step () in
  fun () -&gt; fun x -&gt; compute (localdata, x);;
&#X2026;
farm (seq f, 10)
</PRE></LI></UL><P>
To sum up, the extra <CODE>unit</CODE> parameters give the programmer the
ability to decide whether local initialisation data in his functions
are shared among all copies or not. In other words, we can regard the
skeleton combinators in the current version of <FONT COLOR=purple>OcamlP3l</FONT> as
&#X201C;delayed skeletons&#X201D;, or &#X201C;skeleton factories&#X201D;, that produce <EM>an
instance</EM> of a skeleton every time they are passed an () argument.</P><P>In the following sections we detail the types and semantics of all the skeletons
and provide some usage examples. </P><!--TOC subsection The seq skeleton-->
<H3 CLASS="subsection"><!--SEC ANCHOR --><A NAME="htoc9">1.3.2</A>&#XA0;&#XA0;The seq skeleton</H3><!--SEC END --><P> <A NAME="sec:seq"></A>
The seq skeleton encapsulates an <FONT COLOR=purple>Ocaml</FONT> function <I>f</I> into a stream process
which applies <I>f</I> to all the inputs received on the <EM>input stream</EM> and
sends off the results on the <EM>output stream</EM>. Any <FONT COLOR=purple>Ocaml</FONT> function with
type 
</P><PRE CLASS="verbatim">f: unit -&gt; 'a -&gt; 'b
</PRE><P>can be encapsulated in the seq skeletons as follows:
</P><PRE CLASS="verbatim">seq f
</PRE><P>The central point is that the function must be <EM>unary</EM>, i.e. functions
working on more than one argument must collect them in a single tuple before
being used in a seq. For instance, the fragment 
</P><PRE CLASS="verbatim">let g _ (x,y) = x *. y;;             
let redmul = parfun (fun () -&gt; reducevector(seq(g),6));;
</PRE><P>shows how to encapsulate a float binary operator (<TT>*.</TT>) to use it
within a reducevector with 6 working processes.</P><!--TOC subsection The farm skeleton-->
<H3 CLASS="subsection"><!--SEC ANCHOR --><A NAME="htoc10">1.3.3</A>&#XA0;&#XA0;The farm skeleton</H3><!--SEC END --><P> <A NAME="sec:farm"></A>
The farm skeleton computes in parallel a function <I>f</I> over
different data items appearing in its input stream.<BR>
 From a functional viewpoint, given
a stream of data items <I>x</I><SUB>1</SUB>, &#X2026;, <I>x</I><SUB><I>n</I></SUB>, and a function <I>f</I>, the
expression <TT>farm</TT> (<I>f</I>, <I>k</I>) computes <I>f</I>(<I>x</I><SUB>1</SUB>), &#X2026;, <I>f</I>(<I>x</I><SUB><I>n</I></SUB>).
Parallelism is gained by having <I>k</I> independent processes that
compute <I>f</I> on different items of the input stream.<BR>
 If <I>f</I> has type <TT>(unit -&gt; 'b stream -&gt; 'c stream)</TT>, and
<I>k</I> has type <TT>int</TT>, then <I>farm</I> (<I>f</I>, <I>k</I>) has type
<TT>unit -&gt; 'b stream -&gt; 'c stream</TT>.
In terms of (parallel) processes, a sequence of data appearing onto
the input stream of a farm is submitted to a set of worker processes.
Each worker applies the same function (<I>f</I>, which can be in turn defined
using parallel skeletons) to the data items
received and delivers the result onto the output stream. The resulting
process network looks like the following:
</P><DIV CLASS="center">
<IMG SRC="UserManual005.gif">
</DIV><P>
where the emitter process takes care of task-to-worker scheduling (possibly
taking into account some load balancing strategy).</P><P>The <TT>farm</TT> function takes two parameters: 
</P><UL CLASS="itemize"><LI CLASS="li-itemize">
the first denoting the skeleton expression
representing the farm worker
computation, 
</LI><LI CLASS="li-itemize">the second denoting the parallelism degree the user decided for
the farm, i.e. the number of worker processes that have to be set up
in the farm implementation.
</LI></UL><P>Figure&#XA0;<A HREF="#fig:magiclocal">1.6</A> shows an <FONT COLOR=purple>OcamlP3l</FONT> program which chooses
randomly a number from a list and writes it to the file <TT>magic_number</TT>. Notice the local
initialization of the random number generator (which takes a different seed
in each worker) and the local open of the file to be written. 
In Figure&#XA0;<A HREF="#fig:magicglobal">1.7</A> you can see how the worker code can be simply
transformed to have all the workers share the same file descriptor if needed
(global initialization)<A NAME="@default8"></A><A NAME="@default9"></A>.</P><BLOCKQUOTE CLASS="figure"><DIV CLASS="center"><HR WIDTH="80%" SIZE=2></DIV>
<PRE CLASS="verbatim">let write_int = 
  function () -&gt;
    let fd = Unix.openfile "magic_number" [Unix.O_WRONLY; 
                  Unix.O_CREAT; Unix.O_TRUNC] 0o644 in
    let () = Random.self_init () in
    ( function x -&gt;
 let time_to_wait = 1 + (Random.int 3) in
        Unix.sleep(time_to_wait);
 let sx = string_of_int x in
        ignore(Unix.write fd sx 0 (String.length sx));;

let parwrite = parfun(fun () -&gt; farm(seq(write_int), 5));;

pardo( fun () -&gt;
       let the_stream = P3lstream.of_list [0;1;2;3;4] in
       parwrite the_stream
      );;
</PRE><DIV CLASS="caption"><TABLE CELLSPACING=6 CELLPADDING=0><TR><TD VALIGN=top ALIGN=left>Figure 1.6: A simple farm example (with local initialization) <A NAME="fig:magiclocal"></A></TD></TR>
</TABLE></DIV>
<DIV CLASS="center"><HR WIDTH="80%" SIZE=2></DIV></BLOCKQUOTE><BLOCKQUOTE CLASS="figure"><DIV CLASS="center"><HR WIDTH="80%" SIZE=2></DIV>
<PRE CLASS="verbatim">let write_int = 
  let fd = Unix.openfile "magic_number" [Unix.O_WRONLY; 
                Unix.O_CREAT; Unix.O_TRUNC] 0o644 in
    ( function () -&gt;
      let () = Random.self_init () in
 function x -&gt;
   let time_to_wait = 1 + (Random.int 3) in
     Unix.sleep(time_to_wait);
     let sx = string_of_int x in
       ignore(Unix.write fd sx 0 (String.length sx));;
</PRE><DIV CLASS="caption"><TABLE CELLSPACING=6 CELLPADDING=0><TR><TD VALIGN=top ALIGN=left>Figure 1.7: Worker code using global initialization to share file descriptor <A NAME="fig:magicglobal"></A></TD></TR>
</TABLE></DIV>
<DIV CLASS="center"><HR WIDTH="80%" SIZE=2></DIV></BLOCKQUOTE><!--TOC subsection The pipeline skeleton-->
<H3 CLASS="subsection"><!--SEC ANCHOR --><A NAME="htoc11">1.3.4</A>&#XA0;&#XA0;The pipeline skeleton</H3><!--SEC END --><P><A NAME="sec:pipe"></A>
The pipeline skeleton is denoted by the infix
operator <CODE>|||</CODE>; it performs in parallel the computations relative
to different stages of a function composition over different data
items of the input stream.<BR>
 Functionally, <I>f</I><SUB>1</SUB> <CODE>|||</CODE> <I>f</I><SUB>2</SUB> &#X2026; <CODE>|||</CODE>
<I>f</I><SUB><I>n</I></SUB> computes <I>f</I><SUB><I>n</I></SUB> (&#X2026; <I>f</I><SUB>2</SUB> (<I>f</I><SUB>1</SUB> (<I>x</I><SUB><I>i</I></SUB>))&#X2026;) over all the data
items <I>x</I><SUB><I>i</I></SUB> in the input stream. Parallelism is now gained
by having <I>n</I> independent parallel processes. Each process computes
a function <I>f</I><SUB><I>i</I></SUB> over the data items produced by the process computing
<I>f</I><SUB><I>i</I>&#X2212;1</SUB> and delivers its results to the process computing
<I>f</I><SUB><I>i</I>+1</SUB>.<BR>
 If <I>f</I><SUB>1</SUB> has type <TT>(unit -&gt; 'a stream -&gt; 'b stream)</TT>,<BR>
and <I>f</I><SUB>2</SUB> has type <TT>(unit -&gt; 'b stream -&gt; 'c stream)</TT>,<BR>
then <I>f</I><SUB>1</SUB> ||| <I>f</I><SUB>2</SUB> has type <TT>unit -&gt; 'a stream -&gt; 'c stream</TT>.</P><P>In terms of (parallel) processes, a sequence of data appearing onto
the input stream of a pipe is submitted to the first pipeline stage.
This stage computes the function <I>f</I><SUB>1</SUB> onto every data item appearing
onto the input stream. Each output data item computed by the stage is
submitted to the second stage, computing the function <I>f</I><SUB>2</SUB> and so on
and so on until the output of the <I>n</I>&#X2212;1 stage is submitted to the last
stage. Eventually, the last stage delivers its own output onto the
pipeline output channel. The resulting process network looks like the
following:</P><DIV CLASS="center"><IMG SRC="UserManual006.gif">
</DIV><P>For instance, a pipeline made out of three stages, the first one
squaring integers, the second one multiplying integers by 2 and the
third one incrementing integers can be written as follows:

</P><PRE CLASS="verbatim">let square _ x = x * x;;
let double _ x = 2 * x;;
let inc    _ x = x + 1;;
let apipe = parfun (fun () -&gt; seq(square) ||| seq(double) ||| seq(inc));;
</PRE><P>
A pipeline models (parallel) function composition, thus input and output types
of stages should match. This means that 
if stage (<I>i</I>&#X2212;1) has type
<CODE>unit -&gt; 'c -&gt; 'a</CODE>
stage (<I>i</I>+1) has type
<CODE>unit -&gt; 'b -&gt; 'd</CODE>
stage <I>i</I>-th must have type 
<CODE>unit -&gt; 'a -&gt; 'b</CODE>.
</P><!--TOC subsection The loop skeleton-->
<H3 CLASS="subsection"><!--SEC ANCHOR --><A NAME="htoc12">1.3.5</A>&#XA0;&#XA0;The loop skeleton</H3><!--SEC END --><P> <A NAME="sec:loop"></A>
The loop skeleton named <TT>loop</TT>; it computes a function <I>f</I> over all the
elements of its input stream until a boolean condition <I>g</I> is verified.
A <EM>loop </EM> has type 
</P><PRE CLASS="verbatim">('a -&gt; bool) * (unit -&gt; 'a stream -&gt; 'a stream)
</PRE><P>provided that <I>f</I> has type <CODE>unit -&gt; 'a stream -&gt; 'a stream</CODE>
and <I>g</I> has type <CODE>'a -&gt; bool</CODE>.
Function <I>f</I> is computed before testing termination, thus it is applied at
least once to each input stream element.
In terms of (parallel) processes, a sequence of data appearing onto
the input stream of a loop is submitted to a <EM>loop in</EM> stage. This
stage just merges data coming from the input channel and from the
feedback channel and delivers them to the <EM>loop body</EM> stage. The
loop body stage computes <I>f</I> and delivers results to the <EM>loop end</EM>
stage. This latter stage computes <I>g</I> and either delivers (<TT>f x</TT>) onto the
output channel (in case (<TT>g (f x)</TT>) turns out to be <TT>true</TT>) or it
delivers the value to the loop in process along the feedback channel
(<TT>(g (f x)) = false</TT>).
The resulting process network looks like the following:</P><DIV CLASS="center">
<IMG SRC="UserManual007.gif">
</DIV><P>
For instance, the following loop increments all the integer data items
in the input stream until they become divisible by 5:
</P><PRE CLASS="verbatim">let notdivbyfive x = (x mod 5 &lt;&gt; 0);;
let inc _ x = x + 1;;
let aloop  = parfun (fun () -&gt; loop(notdivbyfive,seq(inc)));;
</PRE><P>The output of this function on the sequence
</P><PRE CLASS="verbatim">3,7,10,14
</PRE><P>is <TT>5,10,15,15</TT>. In particular,
the call <TT>theloop 10</TT> returns <TT>15</TT> as the body <TT>seq(inc)</TT>
is evaluated on input data <I>before</I> the condition, and
therefore the first time the condition is evaluated on 11 and
not on 10.</P><!--TOC subsection The map skeleton-->
<H3 CLASS="subsection"><!--SEC ANCHOR --><A NAME="htoc13">1.3.6</A>&#XA0;&#XA0;The map skeleton</H3><!--SEC END --><P><A NAME="sec:map"></A>
The map skeleton is named <TT>mapvector</TT>; it
computes in parallel a function over all the data items of a vector,
generating the (new) vector of the results.<BR>
Therefore, for each vector <I>X</I> in the input data stream,
<TT>mapvector</TT> (<I>f</I>, <I>n</I>) computes the function <I>f</I> over all the
items of <I>X</I>=[<I>x</I><SUB>1</SUB>,&#X2026; , <I>x</I><SUB><I>n</I></SUB>], using <I>n</I> distinct parallel processes that compute <I>f</I>
over distinct vector items ([<I>f</I>(<I>x</I><SUB>1</SUB>),&#X2026; , <I>f</I>(<I>x</I><SUB><I>n</I></SUB>)]).<BR>
If <I>f</I> has type <TT>(unit -&gt; 'a stream -&gt; 'b stream)</TT>, and <I>n</I>
has type <TT>int</TT>, then <I>mapvector</I> (<I>f</I>, <I>n</I>) has type
<TT>unit -&gt; 'a array stream -&gt; 'b array stream</TT>.</P><P>In terms of (parallel) processes, a vector appearing onto
the input stream of a mapvector is split into <I>n</I> elements and each element is
computed by one of the <I>n</I> workers. Workers apply <I>f</I> to the elements
they receive. A collector process
is in charge of gluing together all the results in a single result vector.</P><DIV CLASS="center">
<IMG SRC="UserManual008.gif">
</DIV><P>Different strategies can be used to distribute a vector <CODE>[|x1;...;xm|]</CODE>
appearing in the input data stream to the workers.
As an example the emitter:
</P><UL CLASS="itemize"><LI CLASS="li-itemize">
may round robin each <CODE>xi</CODE> to the workers
(<CODE>{w1,...wn}</CODE>). The workers in this case simply compute the
function <I>f</I> : '<I>a</I> &#X2192; '<I>b</I> over all the elements appearing onto
their input stream (channel).
</LI><LI CLASS="li-itemize">may split the input data vector in exactly <TT>n</TT>
sub-vectors to be delivered one to each one of the worker processes.
The workers in this case compute an <TT>Array.map</TT> <I>f</I> over all
the elements appearing onto their input stream (channel).
</LI></UL><P>Summarizing, the emitter process takes care of (sub)task-to-worker scheduling
(possibly implementing some kind of load balancing policy),
while the collector process takes care of rebuilding the vector with
the output data items and of delivering the new vector onto the output
data stream.
<TT>mapvector</TT> takes two arguments:
</P><UL CLASS="itemize"><LI CLASS="li-itemize">
the skeleton expression denoting the function to be applied to
all the vector elements, and
</LI><LI CLASS="li-itemize">the parallelism degree of the skeleton, i.e. the number of
processes to be used in the implementation.
</LI></UL><P>For instance, the following code works on a stream of integer vectors
and squares each vector element.
The skeleton has a parallelism degree of 10, that is ten
parallel processes are used to compute each vector in the stream.
</P><PRE CLASS="verbatim">let square _ x = x * x;;
let amap = parfun (fun () -&gt; mapvector(seq(square),10));;
</PRE><P>the result on a single array is as follows
</P><PRE CLASS="verbatim"># amap [|1;2;3;4;5|];;
- : int array = [|1; 4; 9; 16; 25|]
</PRE><!--TOC subsection The reduce skeleton-->
<H3 CLASS="subsection"><!--SEC ANCHOR --><A NAME="htoc14">1.3.7</A>&#XA0;&#XA0;The reduce skeleton</H3><!--SEC END --><P><A NAME="sec:red"></A>
The reduce skeleton is named <TT>reducevector</TT>; it folds
a function over all the data items of a vector.<BR>
Therefore, <TT>reducevector</TT> (&#X2295;, <I>n</I>) computes  <I>x</I><SUB>1</SUB> &#X2295; <I>x</I><SUB>2</SUB>
&#X2295; &#X2026; &#X2295; <I>x</I><SUB><I>n</I></SUB> out of the vector <I>x</I><SUB>1</SUB>, &#X2026;, <I>x</I><SUB><I>n</I></SUB>, for each
vector in the input data stream. The computation
is performed using <I>n</I> different parallel processes that compute <I>f</I>.<BR>
If &#X2295; has type <TT>(unit -&gt; 'a * 'a stream -&gt; 'a stream)</TT>,
and <I>n</I> has type <TT>int</TT>, then<BR>
<I>reducevector</I>(&#X2295;, <I>n</I>) has type <TT>unit -&gt; 'a array stream -&gt; 'a
stream</TT>. </P><P>In terms of (parallel) processes, a vector appearing onto
the input stream of a reducevector is processed by a logical tree of
processes. Each process is able to compute the binary operator
<I>g</I>. The resulting process network looks like the following tree:</P><DIV CLASS="center">
<IMG SRC="UserManual009.gif">
</DIV><P>In this case, the emitter process is the one delivering either couples
of input vector data items or couples of sub-vectors of the input
vector to the processes belonging to the tree base. In the former
case, log(<I>n</I>) levels of processes are needed in the tree, in the
latter one, any number of process levels can be used, and the number
of sub-vectors to be produced by the emitter can be devised
consequently.<BR>

The <TT>reducevector</TT> function takes two parameters as usual:
</P><UL CLASS="itemize"><LI CLASS="li-itemize">
the first parameter is the skeleton expression denoting the binary,
associative and commutative operation (these properties must be
ensured by the programmer to have a correct execution) 
</LI><LI CLASS="li-itemize">the second is the parallelism degree,
i.e. the number of parallel processes that have to be set up to
execute the <TT>reducevector</TT> computation.
</LI></UL><P>For instance, the following skeleton instance accepts in input a
stream of vectors and, for each vector, computes the sum of all
elements using the arithmetic + operator.
</P><PRE CLASS="verbatim">let areduce = parfun
  (fun () -&gt; reducevector(seq(fun _ (x,y) -&gt; x + y),10));;
</PRE><P>the result on a single array is as follows
</P><PRE CLASS="verbatim"># areduce [|1;2;3;4|];;
- : int = 10
</PRE><!--TOC subsection The parfun skeleton-->
<H3 CLASS="subsection"><!--SEC ANCHOR --><A NAME="htoc15">1.3.8</A>&#XA0;&#XA0;The parfun skeleton</H3><!--SEC END --><P>
<A NAME="sec:parfun"></A></P><P>One would expect <TT>parfun</TT> to have type <TT>(unit -&gt; 'a
stream -&gt; 'b stream) -&gt; 'a stream -&gt; 'b stream</TT>: given a skeleton
expression with type <TT>(unit -&gt; 'a stream -&gt; 'b stream)</TT>,
<TT>parfun</TT> returns a stream processing function of type
<TT>'a stream -&gt; 'b stream</TT>.</P><P><TT>parfun</TT>'s actual type introduces an extra level of
functionality: the argument is no longer a skeleton expression but a
functional that returns a skeleton:</P><PRE>
val parfun :
  (unit -&gt; unit -&gt; 'a stream -&gt; 'b stream) -&gt; 'a stream -&gt; 'b stream
</PRE><P>This is necessary to guarantee that the skeleton wrapped in a
<TT>parfun</TT> expression will only be launched and instantiated by
the main program (<CODE>pardo</CODE>), not by any of the multiple running copies of the
SPMD binary, even though those copies may evaluate the
<TT>parfun</TT> skeletons; the main program will actually create the
needed skeletons by applying its functional argument, while the
generic copies will just throw the functional away, carefully avoiding
to instantiate the skeletons.</P><!--TOC subsection The pardo skeleton: a parallel scope delimiter-->
<H3 CLASS="subsection"><!--SEC ANCHOR --><A NAME="htoc16">1.3.9</A>&#XA0;&#XA0;The pardo skeleton: a parallel scope delimiter</H3><!--SEC END --><P>
<A NAME="sec:pardo"></A></P><!--TOC subsubsection Typing-->
<H4 CLASS="subsubsection"><!--SEC ANCHOR -->Typing</H4><!--SEC END --><P>
Finally, the <TT>pardo</TT> combinator defines the scope of the
expressions that may use the <TT>parfun</TT> encapsulated expressions.</P><PRE>
val pardo : (unit -&gt; 'a) -&gt; 'a
</PRE><P><TT>pardo</TT> takes a thunk as argument, and gives back the result of its
evaluation. As for the <TT>parfun</TT> combinator, this extra delay is
necessary to ensure that the initialization of the code will take
place exclusively in the main program and not in the generic SPMD
copies that participate to the parallel computation.</P><!--TOC subsubsection Parallel scoping rule-->
<H4 CLASS="subsubsection"><!--SEC ANCHOR -->Parallel scoping rule</H4><!--SEC END --><P>
In order to have the <CODE>parfun</CODE> and <CODE>pardo</CODE> work correctly together
the following scoping rule has to be followed:
</P><UL CLASS="itemize"><LI CLASS="li-itemize">functions defined via the <TT>parfun</TT> combinator
must be <EM>defined</EM> <EM>before</EM> the occurrence of the
<TT>pardo</TT> combinator,</LI><LI CLASS="li-itemize">those <TT>parfun</TT> defined functions can only be
<EM>executed</EM> <EM>within</EM> the body of the functional parameter of
the <TT>pardo</TT> combinator,</LI><LI CLASS="li-itemize">no <TT>parfun</TT> can be used directly inside a <TT>pardo</TT>
combinator.</LI></UL><!--TOC subsubsection Structure of an <FONT COLOR=purple>OcamlP3l</FONT> program -->
<H4 CLASS="subsubsection"><!--SEC ANCHOR -->Structure of an <FONT COLOR=purple>OcamlP3l</FONT> program </H4><!--SEC END --><P><A NAME="f11"></A></P><P>Due to the scoping rule in the pardo, the general structure of an <FONT COLOR=purple>OcamlP3l</FONT>
program looks like the following:</P><PRE>
(* (1) Functions defined using parfun *)
let f = parfun(<EM>skeleton expression</EM>)
let g = parfun(<EM>skeleton expression</EM>)

(* (2) code referencing these functions under abstractions *)

let h x = &#X2026; (f &#X2026;) &#X2026; (g &#X2026;) &#X2026;
&#X2026;<BR>
(* NO evaluation of code containing a parfun is allowed outside pardo *)
&#X2026;<BR>
(* (3) The pardo occurrence where parfun encapsulated
       functions can be called. *)
pardo
 (fun () -&gt;
    (* NO parfun combinators allowed here *)

    (* code evaluating parfun defined functions *)
    &#X2026;
    let a = f &#X2026;
    let b = h &#X2026;
    &#X2026;
 )
(* <EM>finalization of sequential code here</EM> *)
</PRE><P>At run time, in the sequential model, each generic copy just waits for
instructions from the main node; the main node first evaluates the
arguments of the <TT>parfun</TT> combinators to build a representation
of the needed skeletons; then, upon encountering the <TT>pardo</TT>
combinator, the main node initializes all the parallel computation
networks, specialising the generic copies (as described in detail in
[<A HREF="#Ocamlp3lMlw98"><CITE>10</CITE></A>]), connects these networks to the sequential
interfaces defined in the <TT>parfun</TT>'s, and then runs the
sequential code in its scope by applying its function parameter to
<TT>():unit</TT>. The whole picture is illustrated in Figure
<A HREF="#fig:pardo">1.3</A>. The skeleton networks are initiated only once but
could be invoked many times during the execution of <TT>pardo</TT>.</P><!--TOC section Load balancing: the colors-->
<H2 CLASS="section"><!--SEC ANCHOR --><A NAME="htoc17">1.4</A>&#XA0;&#XA0;Load balancing: the colors</H2><!--SEC END --><P>
<A NAME="ss:colors"></A></P><P>In the <FONT COLOR=purple>OcamlP3l</FONT> system, the combinator expressions govern the
shape of the process network and the execution model assumes a `virtual'
processor, for each process. 
The mapping of virtual to
physical processors is delegated to the <FONT COLOR=purple>OcamlP3l</FONT> system.
The mapping is currently not optimized in the system. However, programs and
machines can be annotated by the programmer using <EM>colors</EM>, which can
guide the virtual-to-physical mapping process.</P><P>The idea is to have the programmer rank the relative `weight' of
skeleton instances and the machine power in a range of integer values (the
colors). Then, weights are used to
generate a mapping in which load is evenly balanced on the participating
machines according to their relative power.</P><P>Pushing the difficult part of the generation of weights
to the programmer's knowledge and ability, this
simple and practical idea gives surprisingly good results in practice.</P><P>Let's consider as an example, the skeletal expression we discussed in the
example (Sec&#XA0;<A HREF="#ss:example">1.2.3</A>):
</P><TABLE CLASS="display dcenter"><TR VALIGN="middle"><TD CLASS="dcell"><TT>farm</TT>&#XA0;&#XA0;(<TT>seq</TT>&#XA0;(<I>fun</I>&#XA0;<I>x</I>&#XA0;&#X2192;&#XA0;<I>x</I>&#XA0;*&#XA0;<I>x</I>),&#XA0;16)
</TD></TR>
</TABLE><P>
that corresponds to a network of one emitter node, one collector node,
and 16 worker nodes which compute the square function. There are
numerous ways of mapping a set of virtual nodes to a set of physical
nodes. 
</P><P>If no information is provided, the support uses a simple round robin,
which maps virtual to physical nodes in a cyclic way: first all physical
processors get one process, then we start again from the beginning until
virtual processors are all allocated.
Unfortunately, such a
solution doesn't take into account the load balancing constraints: all
the physical (resp. virtual) nodes are considered equivalent in
computing power and are used evenly. </P><P>If the programmer knows more about machines and processes he/she can tell
the system using colors.
A <EM>color</EM> is an optional integer parameter that is added to
<FONT COLOR=purple>OcamlP3l</FONT> expressions in the source program and to the execution
command line of the compiled program. We use the regular <FONT COLOR=purple>Ocaml</FONT>'s
optional parameters syntax, with keyword <EM>col</EM>, to specify the
colors of a network of virtual nodes. For example, writing
<CODE>farm ~col:k (f, n)</CODE>
means that all virtual nodes inside this farm structure
should be mapped to some physical nodes with a capability ranking
<I>k</I>. The scope of a color specification covers all the inner nodes of
the structure it qualifies: unless explicitly specified, the color of
an inside expression is simply inherited from the outer layer (the
outermost layer has a default color value of 0 which means no special
request).</P><P>For combinators <TT>farm</TT>, <TT>mapvector</TT> and
<TT>reducevector</TT>, in addition to the color of the combinator
itself, there is an additional optional color parameter <I>colv</I>. A
<I>colv</I> specification is a <TT>color list</TT> (i.e. an <TT>int
list</TT>) that specifies the colors of the parallel worker structures
that are arguments of the combinator. As an example, the <FONT COLOR=purple>OcamlP3l</FONT>
expression</P><PRE>
mapvector ~col:2 ~colv:[ 3; 4; 5; 6 ] (seq f, 4)
</PRE><P>
is a <TT>mapvector</TT> skeleton expression, with emitter and
collector nodes of rank 2, and four worker nodes (four copies of
<CODE>seq f</CODE>) with respective ranks 3, 4, 5, and 6.</P><P>To carefully map virtual nodes to physical nodes, we also need a way 
to define the colors of physical nodes. This information is specified 
on the command line when launching the program (see Section&#XA0;<A HREF="#cap:run">2</A>). One can write:</P><PRE>
prog.par -p3lroot ip1:port1#color1 ip2:port2#color2 ... \
                  ip_i:port_i#color_i ...
</PRE><P>
where <TT>ip_i:port_i#color_i</TT> indicates the ip address (or
name), the port, and the color of the physical node <TT>i</TT>
participating in the computation. The port and color here are both
optional. With no specified port, a default p3lport is used; with no
color specification, the default color 0 is assumed.</P><P>If the colors of all the virtual processors and all the physical
processors have a one-to-one correspondence, the mapping is easy. But
such a perfect mapping does not exist in general: first of all, there
is not always equality between the number of physical processors we
have and the number of virtual processors we need; second, in some
very complex <FONT COLOR=purple>OcamlP3l</FONT> expressions, it is complex and boring for
the programmer to calculate manually how many virtual nodes are needed
for each color class.</P><P>So, we decided to use a simple but flexible mapping algorithm, based on
the idea that <EM>what a color means is not the </EM><EM><B>exact</B></EM><EM>
capability required but the </EM><EM><B>lowest</B></EM><EM> capability
acceptable</EM>. For example, a virtual node with color value 5 means a
physical node of color 5 is needed, but if there is no physical node
with value 5, and there exists a physical node of color 6 free and
available, why don't we take it instead? In practice, we sort the virtual
nodes in decreasing order of their color values, to reflect their
priority in choosing a physical node: virtual nodes with bigger colors
should have more privilege and choose their physical node before the
nodes with smaller colors. Then, for each virtual node, we list all
the physical nodes with a color greater than or equal to the virtual
node color. Among all those qualified ones, the algorithm finally
associates the virtual node with the qualified node which has the
smallest work load (the one that has the least number of virtual nodes
that have been assigned to it).</P><P>This algorithm provides a mapping process with some degree of
automatization and some degree of manual tuning, but one has to keep
in mind that the <EM>color</EM> designs a computational class,
qualitatively, and is not an exact quantitative estimation of the
computational power of the machine, as the current version of
<FONT COLOR=purple>OcamlP3l</FONT> does not provide yet the necessary infrastructure to
perform an optimal mapping based on precise quantitative estimations
of the cost of each sequential function and the capabilities of the
physical nodes, so that we cannot guarantee our color-based mapping
algorithm to be highly accurate or highly effective.</P><P>Still, the &#X201C;color&#X201D; approach is accurate and simple enough to be
quite significant to the programmer: according to the experiments we
have conducted, it indeed achieved some satisfactory results in our
test bed case (see [<A HREF="#clement04"><CITE>5</CITE></A>]).</P><P>Figure <A HREF="#t:full">1.8</A> summarizes the types of the combinators, exactly
as they are currently available to the programmer in the
<TT>2.0</TT> version of <FONT COLOR=purple>OcamlP3l</FONT>, including the optional color
parameters.</P><BLOCKQUOTE CLASS="figure"><DIV CLASS="center"><HR WIDTH="80%" SIZE=2></DIV>
<PRE>
type color = int

val seq :
  ?col:color -&gt;
  (unit -&gt; 'a -&gt; 'b) -&gt; unit -&gt; 'a stream -&gt; 'b stream
val ( ||| ) :
  (unit -&gt; 'a stream -&gt; 'b stream) -&gt;
  (unit -&gt; 'b stream -&gt; 'c stream) -&gt; unit -&gt; 'a stream -&gt; 'c stream
val loop :
  ?col:color -&gt;
  ('a -&gt; bool) * (unit -&gt; 'a stream -&gt; 'a stream) -&gt;
  unit -&gt; 'a stream -&gt; 'a stream
val farm :
  ?col:color -&gt;
  ?colv:color list -&gt;
  (unit -&gt; 'b stream -&gt; 'c stream) * int -&gt;
  unit -&gt; 'b stream -&gt; 'c stream
val mapvector :
  ?col: color -&gt;
  ?colv:color list -&gt;
  (unit -&gt; 'b stream -&gt; 'c stream) * int -&gt;
  unit -&gt; 'b array stream -&gt; 'c array stream
val reducevector :
  ?col:color -&gt;
  ?colv:color list -&gt;
  (unit -&gt; ('b * 'b) stream -&gt; 'b stream) * int -&gt;
  unit -&gt; 'b array stream -&gt; 'b stream
val parfun :
  (unit -&gt; unit -&gt; 'a stream -&gt; 'b stream) -&gt; 'a stream -&gt; 'b stream
val pardo : (unit -&gt; 'a) -&gt; 'a
</PRE>
<DIV CLASS="caption"><TABLE CELLSPACING=6 CELLPADDING=0><TR><TD VALIGN=top ALIGN=left>Figure 1.8: The (complete) types of the <FONT COLOR=purple>OcamlP3l</FONT> skeleton combinators <A NAME="t:full"></A></TD></TR>
</TABLE></DIV>
<DIV CLASS="center"><HR WIDTH="80%" SIZE=2></DIV></BLOCKQUOTE><!--BEGIN NOTES chapter-->
<HR CLASS="footnoterule"><DL CLASS="thefootnotes"><DT CLASS="dt-thefootnotes">
<A NAME="note4" HREF="#text4">1</A></DT><DD CLASS="dd-thefootnotes">See 
<TT>http://pauillac.inria.fr/ocaml/</TT>
</DD><DT CLASS="dt-thefootnotes"><A NAME="note5" HREF="#text5">2</A></DT><DD CLASS="dd-thefootnotes">See
<TT>http://www.di.unipi.it/.susanna/p3l.html</TT>
</DD><DT CLASS="dt-thefootnotes"><A NAME="note6" HREF="#text6">3</A></DT><DD CLASS="dd-thefootnotes">See <TT>http://www.dicosmo.org/ocamlp3l/</TT> for relevant
information, up to date references, documentation, examples,
distribution code and dynamic web pages showcasing the <FONT COLOR=purple>OcamlP3l</FONT>
 features.
</DD><DT CLASS="dt-thefootnotes"><A NAME="note7" HREF="#text7">4</A></DT><DD CLASS="dd-thefootnotes">In practice, the initialization step may do weird, non referentially
transparent things, like opening file descriptors or negotiating a
network connection to other services: it is then crucial to allow
the different instances of the user's function to have their own local
descriptors or local connections to simply avoid the chaos.
</DD></DL>
<!--END NOTES-->
<!--TOC chapter Running your <FONT COLOR=purple>OcamlP3l</FONT> program-->
<H1 CLASS="chapter"><!--SEC ANCHOR --><A NAME="htoc18">Chapter&#XA0;2</A>&#XA0;&#XA0;Running your <FONT COLOR=purple>OcamlP3l</FONT> program</H1><!--SEC END --><P>
<A NAME="cap:run"></A></P><P>We give here a practical tutorial on how to use the system, without entering
into the implementation details of the current version of <FONT COLOR=purple>OcamlP3l</FONT>.</P><P>As mentioned above, once you have written an <FONT COLOR=purple>OcamlP3l</FONT> program, you have
several choices for its execution, since you can <EM>without touching your
source</EM>:</P><DL CLASS="description"><DT CLASS="dt-description"><B>sequential</B></DT><DD CLASS="dd-description"> run your program sequentially on one machine, to test the
logic of the algorithm you implemented with all the usual sequential
debugging tools.</DD><DT CLASS="dt-description"><B>graphics</B></DT><DD CLASS="dd-description"> get a picture of the processor net described by your
<FONT COLOR=purple>OcamlP3l</FONT> skeleton expression to grasp the parallel structure of your
program.</DD><DT CLASS="dt-description"><B>parallel</B></DT><DD CLASS="dd-description"> run your program in parallel over a network of workstations
after a simple recompilation.</DD></DL><P>
Presumably, you would run the parallel version once the program has
satisfactorily passed the sequential debugging phase.</P><P>In the following sections, our running example is the computation of a
Mandelbrot fractal set. We will describe the implementation program, compile it
and run it in the three ways described above.</P><!--TOC section The Mandelbrot example program-->
<H2 CLASS="section"><!--SEC ANCHOR --><A NAME="htoc19">2.1</A>&#XA0;&#XA0;The Mandelbrot example program</H2><!--SEC END --><P>The Mandelbrot example program performs the calculation of the Mandelbrot set
at a given resolution in a given area of the graphic display. This is the
actual program provided in the Examples directory of the distribution.</P><P>The computing engine of the program is the function <TT>pixel_row</TT> which
computes the color of a row of pixels from the convergence of a sequence of
complex numbers <I>z</I><SUB><I>n</I></SUB> defined by the initial term <I>z</I><SUB>0</SUB> and the formula <I>z</I><SUB><I>n</I> +
1</SUB> = <I>z</I><SUB><I>n</I></SUB><SUP>2</SUP> + <I>z</I><SUB>0</SUB>. More precisely, given a point <I>p</I> in the complex plane,
we associate to <I>p</I> the sequence <I>z</I><SUB><I>n</I></SUB> when starting with <I>z</I><SUB>0</SUB> = <I>p</I>. Now, we
compute the integer <I>m</I> such that <I>z</I><SUB><I>m</I></SUB> is the first term of the sequence
satisfying the following condition: either the sum of the real and imaginary
parts of <I>z</I><SUB><I>n</I></SUB> exceeds a given threshold, or the number of iterations exceeds
some fixed maximum <EM>resolution</EM> limit. Integer <I>m</I> defines the color of
<I>p</I>.</P><P>This corresponds to the following <FONT COLOR=purple>Ocaml</FONT> code:</P><PRE>
open Graphics;;

let n   = 300;; (* the size of the square screen windows in pixels      *)
let res = 100;; (* the resolution: maximum number of iterations allowed *)

(* convert an integer in the range 0..res into a screen color *)

let color_of c res = Pervasives.truncate 
      (((float c)/.(float res))*.(float Graphics.white));;

(* compute the color of a pixel by iterating z_m+1=z_m^2+c             *)
(* j is the k-th row, initialized so that j.(i),k  are the coordinates *)
(* of the pixel (i,k)                                                  *)

let pixel_row (j,k,res,n) = 
  let zr = ref 0.0 in
  let zi = ref 0.0 in
  let cr = ref 0.0 in
  let ci = ref 0.0 in
  let zrs = ref 0.0 in
  let zis = ref 0.0 in
  let d   = ref (2.0 /. ((float  n) -. 1.0)) in
  let colored_row = Array.create n (Graphics.black) in

  for s = 0 to (n-1) do
    let j1 = ref (float  j.(s)) in
    let k1 = ref (float  k) in
    begin
      zr := !j1 *. !d -. 1.0;
      zi := !k1 *. !d -. 1.0;
      cr := !zr;
      ci := !zi;
      zrs := 0.0;
      zis := 0.0;
      for i=0 to (res-1) do
        begin
          if(not((!zrs +. !zis) &gt; 4.0))
          then 
            begin
              zrs := !zr *. !zr;
              zis := !zi *. !zi;
              zi  := 2.0 *. !zr *. !zi +. !ci;
              zr  := !zrs -. !zis +. !cr;
              Array.set colored_row s (color_of i res);
            end;
        end
      done
    end
  done;
  (colored_row,k);;
</PRE><P>In this code, the global complex interval sampled stays within <TT>(-1.0,
-1.0)</TT> and <TT>(1.0, 1.0)</TT>. In this 2-unit wide square, the <TT>pixel_row</TT>
function computes rows of pixels separated by the distance <TT>d</TT>. The <TT>pixel_row</TT> function takes four parameters: <TT>size</TT>, the number of pixels
in a row; <TT>resolution</TT>, the resolution; <TT>k</TT>, the index of the row to
be drawn; and, <TT>j</TT>, an array which will be filled with the integers
representing the colors of pixels in the row. These values will be converted
into real colors by the <TT>color_of</TT> function. In this program, the
threshold is fixed to be <TT>4.0</TT>. We name <TT>zr</TT> and <TT>zi</TT> the real and
imaginary parts of <I>z</I><SUB><I>i</I></SUB>; similarly, the real and imaginary parts of <I>c</I> are
<TT>cr</TT> and <TT>ci</TT>; <TT>zrs</TT> and <TT>zis</TT> are temporary variables for the
square of <TT>zr</TT> and <TT>zi</TT>; <TT>d</TT> is the distance between two rows.</P><P>The Mandelbrot computation over the whole set of points within <TT>(-1.0,-1.0)</TT> and <TT>(1.0,1.0)</TT> in the complex plane can be computed in
parallel exploiting farm parallelism. The set of points is split by
<TT>gen_rows</TT> into a bunch of pixel rows that build up the input stream,
the computation of the Mandelbrot set on each row of complex points is
independent and can be performed by the worker processes using <TT>pixel_row</TT>
and the result is a stream of rows of pixel colors, each corresponding to an
input pixel row.</P><PRE>
(* draw a line on the screen using fast image functions *)
let show_a_result r =
  match r with
    (col,j) -&gt;
      draw_image (make_image [| col |]) 0 j;;

(* generate the tasks *)
let gen_rows = 
  let seed = ref 0 in
  let ini = Array.create n 0 in
  let iniv = 
    for i=0 to (n-1) do
      Array.set ini i i
    done; ini in
  (function () -&gt; 
    if(!seed &lt; n) 
    then let r = (iniv,!seed,res,n) in (seed:=!seed+1;r)
    else raise End_of_file)
;;
</PRE><P>The actual farm is defined by the <TT>mandel</TT> function which uses the
<TT>parfun</TT> skeleton to transform a farm instance with 10 workers into an
<FONT COLOR=purple>Ocaml</FONT> sequential function. Notice that the <TT>seq</TT>
skeleton has been used to turn the <TT>pixel_row</TT> function into a stream
process, which can be used to instantiate a skeleton.
Finally the <TT>pardo</TT> skeleton takes care of opening/closing a
display window on the end-node (the one running <TT>pardo</TT>), and of
actually activating the farm invoking <TT>mandel</TT>. The function
<TT>show_a_result</TT> actually displays a pixel row on the end-node. Notice
that this
code would need to be written anyway, maybe arranged in a different way, for a
purely sequential implementation.<BR>


</P><PRE>
(* the skeleton expression to compute the image *)
let mandel = parfun (fun () -&gt; farm(seq(pixel_row),10));;

pardo (fun () -&gt;
  print_string "opening...";print_newline();
  open_graph (" "^(string_of_int n)^"x"^(string_of_int n));

  (* here we do the parallel computation *)
  List.iter show_a_result 
        (P3lstream.to_list (mandel (P3lstream.of_fun gen_rows)));

  print_string "Finishing";print_newline();
  for i=0 to 50000000 do let _ =i*i in () done;
  print_string "Finishing";print_newline();
  close_graph()
)
</PRE><!--TOC section Sequential execution-->
<H2 CLASS="section"><!--SEC ANCHOR --><A NAME="htoc20">2.2</A>&#XA0;&#XA0;Sequential execution</H2><!--SEC END --><P>We assume the program being written in a file named <TT>mandel.ml</TT>. We
compile the sequential version using <TT>ocamlp3lcc</TT> as follows:</P><PRE CLASS="verbatim">ocamlp3lcc --sequential mandel
</PRE><DIV CLASS="theorem"><B>Remark&#XA0;1</B>&#XA0;&#XA0;<EM>
In the current implementation, this boils down to adding on top of
</EM><EM><TT>mandel.ml</TT></EM><EM> the line</EM><PRE CLASS="verbatim"><EM>open Seqp3l;;
</EM></PRE><P><EM><EM>to obtain a temporary file </EM></EM><EM><EM><TT>mandel.seq.ml</TT></EM></EM><EM><EM> which is then compiled via
the regular Caml compiler </EM></EM><EM><EM><TT>ocamlc</TT></EM></EM><EM><EM> with the proper modules and libraries
linked. Depending on the configuration of your system, this may look like the
following</EM></EM></P><PRE CLASS="verbatim"><EM><EM>ocamlc -custom unix.cma graphics.cma seqp3l.cmo 
       -o mandel.seq mandel.seq.ml 
       -cclib -lunix -cclib -lgraphics -cclib -L/usr/X11R6/lib 
       -cclib -lX11
</EM></EM></PRE><P><EM><EM>We highly recommend not to use explicit call to </EM></EM><EM><EM><TT>ocamlc</TT></EM></EM><EM><EM>: use the
</EM></EM><EM><EM><TT>ocamlp3lcc</TT></EM></EM><EM><EM> compiler that is especially devoted to the compilation of
</EM></EM><EM><EM><FONT COLOR=purple>OcamlP3l</FONT></EM></EM><EM><EM> programs. </EM></EM><EM><EM>&#X22C4;</EM></EM><EM><EM>
</EM></EM></P></DIV><P>After the compilation, we get an executable file, <TT>mandel.seq</TT>, whose
execution produces the picture shown on the left side of Figure&#XA0;<A HREF="#f:mandelparseq">2.1</A>.
</P><BLOCKQUOTE CLASS="figure"><DIV CLASS="center"><HR WIDTH="80%" SIZE=2></DIV>
<IMG SRC="UserManual010.gif">
<DIV CLASS="caption"><TABLE CELLSPACING=6 CELLPADDING=0><TR><TD VALIGN=top ALIGN=left>Figure 2.1: A snapshot of the execution of <TT>mandel.ml</TT> (left is sequential execution, right is parallel execution on 5 machines).</TD></TR>
</TABLE></DIV>
<A NAME="f:mandelparseq"></A>
<DIV CLASS="center"><HR WIDTH="80%" SIZE=2></DIV></BLOCKQUOTE><!--TOC section Graphical execution-->
<H2 CLASS="section"><!--SEC ANCHOR --><A NAME="htoc21">2.3</A>&#XA0;&#XA0;Graphical execution</H2><!--SEC END --><P>It is often useful to look at the structure of the application process network,
for example when tuning the performance of the final program. In <FONT COLOR=purple>OcamlP3l</FONT>,
this can be done by compiling the program with the special option
<TT>&#X2013;graphical</TT> which automatically creates a picture displaying the
`logical' parallel program structure.</P><PRE CLASS="verbatim">ocamlp3lcc --graphical mandel.ml
</PRE><DIV CLASS="theorem"><B>Remark&#XA0;1</B>&#XA0;&#XA0;<P><EM>In the current implementation, this boils down to adding on top of
</EM><EM><TT>mandel.ml</TT></EM><EM> the line</EM></P><PRE CLASS="verbatim"><EM>open Grafp3l;;
</EM></PRE><P><EM>to obtain a temporary file </EM><EM><TT>mandel.gra.ml</TT></EM><EM> which is then compiled via
</EM><EM><TT>ocamlc</TT></EM><EM> with the proper modules and libraries. Depending on the
configuration of your system, this may look like the following</EM></P><PRE CLASS="verbatim"><EM>ocamlc -custom graphics.cma grafp3l.cmo -o mandel.gra mandel.gra.ml 
       -cclib -lgraphics -cclib -L/usr/X11R6/lib -cclib -lX11
</EM></PRE><P><EM>Once more, we highly recommend not to use explicit calls to </EM><EM><TT>ocamlc</TT></EM><EM>: use the
</EM><EM><TT>ocamlp3lcc</TT></EM><EM> compiler that is especially devoted to the compilation of
</EM><EM><FONT COLOR=purple>OcamlP3l</FONT></EM><EM> programs. </EM><EM>&#X22C4;</EM><EM>
</EM></P></DIV><P>After compilation, we get the executable file <TT>mandel.gra</TT>, whose
execution produces the following picture.
</P><DIV CLASS="center">
<IMG SRC="UserManual011.gif">
</DIV><!--TOC section Parallel execution-->
<H2 CLASS="section"><!--SEC ANCHOR --><A NAME="htoc22">2.4</A>&#XA0;&#XA0;Parallel execution</H2><!--SEC END --><P>Once we have checked the sequential version of our code, and got a picture of the
structure of the parallel network, we are ready to speed up the computation by
using a network of computers.</P><!--TOC subsection Compilation for parallel execution-->
<H3 CLASS="subsection"><!--SEC ANCHOR --><A NAME="htoc23">2.4.1</A>&#XA0;&#XA0;Compilation for parallel execution</H3><!--SEC END --><P>We call the compiler with the special option <TT>&#X2013;parallel</TT> devoted
to compilation for parallel execution:</P><PRE CLASS="verbatim">ocamlp3lcc --parallel mandel
</PRE><DIV CLASS="theorem"><B>Remark&#XA0;1</B>&#XA0;&#XA0;<EM>
In the current implementation this boils down to adding on top of
</EM><EM><TT>mandel.ml</TT></EM><EM> the lines</EM><PRE CLASS="verbatim"><EM>open Parp3l;;
open Nodecode;;
open Template;;
</EM></PRE><P><EM><EM>to obtain a temporary file </EM></EM><EM><EM><TT>mandel.par.ml</TT></EM></EM><EM><EM> which is then compiled via
</EM></EM><EM><EM><TT>ocamlc</TT></EM></EM><EM><EM> with the proper modules and libraries. Depending on the
configuration of your system, this may look like the following</EM></EM></P><PRE CLASS="verbatim"><EM><EM>ocamlc -custom unix.cma p3lpar.cma -o mandel.par mandel.par.ml 
       -cclib -lunix -cclib -lgraphics -cclib -L/usr/X11R6/lib
       -cclib -lX11
</EM></EM></PRE><P><EM><EM>Once again, we highly recommend not to use explicit calls to </EM></EM><EM><EM><TT>ocamlc</TT></EM></EM><EM><EM>: use the
</EM></EM><EM><EM><TT>ocamlp3lcc</TT></EM></EM><EM><EM> compiler that is especially devoted to the compilation of
</EM></EM><EM><EM><FONT COLOR=purple>OcamlP3l</FONT></EM></EM><EM><EM> programs. </EM></EM><EM><EM>&#X22C4;</EM></EM><EM><EM>
</EM></EM></P></DIV><P>The compilation produces an executable file named <TT>mandel.par</TT>.</P><!--TOC section Common options-->
<H2 CLASS="section"><!--SEC ANCHOR --><A NAME="htoc24">2.5</A>&#XA0;&#XA0;Common options</H2><!--SEC END --><P>The parallel compilation of <FONT COLOR=purple>OcamlP3l</FONT> programs creates executables that are
equipped with the following set of predefined options:</P><UL CLASS="itemize"><LI CLASS="li-itemize"><CODE>-p3lroot</CODE>, to declare this invocation of the program as the root node.</LI><LI CLASS="li-itemize"><CODE>-dynport</CODE>, to force this node to use a dynamic port number instead
of the default <CODE>p3lport</CODE>; in addition the option outputs it (useful if
you want to run more slave copies on the same machine).</LI><LI CLASS="li-itemize"><CODE>-debug</CODE>, to enable debugging for this node at level <I>n</I>. Currently all
levels are equal.</LI><LI CLASS="li-itemize"><CODE>-ip</CODE>, to force the usage of a specified ip address. Useful when you
are on a laptop named localhost and you want to be able to choose among
network interfaces.</LI><LI CLASS="li-itemize"><CODE>-strict</CODE>, to specify a strict mapping between physical and virtual
processors.</LI><LI CLASS="li-itemize"><CODE>-version</CODE>, to print version information.</LI><LI CLASS="li-itemize"><CODE>-help</CODE> or <CODE>--help</CODE> Displays this list of options.</LI></UL><!--TOC subsection Parallel computation overview-->
<H3 CLASS="subsection"><!--SEC ANCHOR --><A NAME="htoc25">2.5.1</A>&#XA0;&#XA0;Parallel computation overview</H3><!--SEC END --><P>The executable produced by using the <TT>&#X2013;parallel</TT> option of the compiler
behaves either as a generic computation node, or as the unique <EM>root
configuration node</EM>, according to the set of arguments provided at launch
time.</P><P>To set up and launch the parallel computation network, we need to run multiple
invocations of the parallel executable:
</P><UL CLASS="itemize"><LI CLASS="li-itemize">run one copy instance of <TT>mandel.par</TT>, with no arguments,
on each machine that takes part in the parallel computation.
These processes wait for configuration information sent by the designated
<EM>root node</EM>,</LI><LI CLASS="li-itemize">create a root node, by launching one extra copy of <TT>mandel.par</TT>
with the special option <TT>-p3lroot</TT>.</LI></UL><P>
As soon as created, the root node configures all other participating nodes and
then executes locally the <CODE>pardo</CODE> encapsulated sequential code.</P><P>In addition to the <TT>-p3lroot</TT> special option, the root node invocation
must specify the information concerning the machines involved in the
computational network (their ip address or name, their port and color).</P><!--TOC section Launching the parallel computation-->
<H2 CLASS="section"><!--SEC ANCHOR --><A NAME="htoc26">2.6</A>&#XA0;&#XA0;Launching the parallel computation</H2><!--SEC END --><P>Here is a simple script to launch the parallel network on several machines:

</P><PRE CLASS="verbatim">#!/bin/sh

# The list of machines
NODES="machine1 machine2 machine3 machine4"
# The name of executable to be launched
PAR="./mandel.par"

echo -n "Launching OcamlP3L $PAR on the cluster:" 
for NODE in $NODES; do   #(*1*)
    echo -n " $NODE"
#launching a generic computation node on each machine
    ssh $NODE $PAR 1&gt; log-$NODE 2&gt; err-$NODE &amp; 

# a possible coloring of machines
    case $NODE in                                   #(*2*)
        machine1) COLORED_NODES="$COLORED_NODES $NODE#1";;
        *) COLORED_NODES="$COLORED_NODES $NODE#2";;
    esac
done

echo "Starting computation with $COUNT node(s): $COLORED_NODES..."
# launch the unique root configuration node #(*3*)
$PAR -p3lroot $COLORED_NODES 1&gt; log-root 2&gt; err-root

echo "Finished."
</PRE><P>
This script assumes <CODE>mandel.par</CODE> to be accessible to all participating
machines and does the following:
</P><UL CLASS="itemize"><LI CLASS="li-itemize">
runs <CODE>mandel.par</CODE> on all participating machines
(<CODE>#(*1*)</CODE>),
</LI><LI CLASS="li-itemize">generates a coloring for participating nodes (<CODE>#(*2*)</CODE>),
</LI><LI CLASS="li-itemize">launches the computation starting the root process on the local machine
(<CODE>#(*3*)</CODE>) providing the list of colored participating hosts.
</LI></UL><P>In future versions, especially those incorporating the MPI communication layer,
the startup mechanism will possibly work differently (typically, the
initialization steps will be performed by the MPI layer).</P><!--TOC section Common errors-->
<H2 CLASS="section"><!--SEC ANCHOR --><A NAME="htoc27">2.7</A>&#XA0;&#XA0;Common errors</H2><!--SEC END --><P>
<A NAME="sec:commonerrors"></A></P><P>A few words of warning now: even if the user program is now easy to write,
compile and execute, you should not forget that the underlying machinery is
quite sophisticated, and that in some situations you may not get what you
expected. Two typical problems you may encounter are the following:</P><DL CLASS="description"><DT CLASS="dt-description"><B>output value: code mismatch</B></DT><DD CLASS="dd-description"> If you see this error in the
parallel execution of your program, it means that two incompatible
versions of your program are trying to communicate. <FONT COLOR=purple>Ocaml</FONT> uses an
MD5 check of the code area before sending closures over a channel,
because this operation only makes sense between &#X201C;identical&#X201D;
programs.<BR>
 Two possible reasons for the error are:
<UL CLASS="itemize"><LI CLASS="li-itemize">
an old version of your program is still running somewhere and
is trying to communicate with the newer version you are
running now. You should kill all the running processes and try
again.
</LI><LI CLASS="li-itemize">you are running copies of the program compiled for different
architectures. This feature is not yet supported, and you
should run the program on homogeneous architectures.
</LI></UL></DD><DT CLASS="dt-description"><B>references</B></DT><DD CLASS="dd-description"> You should remember that the user functions provided
to the skeletons will be all executed on different machines, so
their behaviour <EM>must not</EM> rely on the existence of
implicitly shared data, like global references: if you do, the
sequential behaviour and the parallel one will be different.
This does not imply that all user function be real functions
(you can use local store to keep a counter for example), but an
access to a global reference is certainly a mistake (since every node will
access its <EM>own private</EM> copy of the data, thus defeating the
purpose of the shared data).</DD></DL><!--TOC chapter More programming examples-->
<H1 CLASS="chapter"><!--SEC ANCHOR --><A NAME="htoc28">Chapter&#XA0;3</A>&#XA0;&#XA0;More programming examples</H1><!--SEC END --><P><A NAME="cap:exe"></A></P><!--TOC section Generating and consuming streams-->
<H2 CLASS="section"><!--SEC ANCHOR --><A NAME="htoc29">3.1</A>&#XA0;&#XA0;Generating and consuming streams</H2><!--SEC END --><P>
Streams to be fed to the parallel networks can be created and consumed using
functions in <TT>P3lstream</TT>. Main functions are as follows:
</P><BLOCKQUOTE CLASS="table"><DIV CLASS="center"><HR WIDTH="80%" SIZE=2></DIV>
<TABLE BORDER=1 CELLSPACING=0 CELLPADDING=1><TR><TD ALIGN=left NOWRAP>Function</TD><TD ALIGN=left NOWRAP>Description</TD><TD ALIGN=left NOWRAP>Secs</TD></TR>
<TR><TD ALIGN=left NOWRAP><TT>P3lstream.of_list</TT></TD><TD ALIGN=left NOWRAP>transforms a list in a valid stream</TD><TD ALIGN=left NOWRAP><A HREF="#sec:inputuno">3.1.1</A>, <A HREF="#sec:inputoutputdafile">3.1.2</A>,</TD></TR>
<TR><TD ALIGN=left NOWRAP>&nbsp;</TD><TD ALIGN=left NOWRAP>&nbsp;</TD><TD ALIGN=left NOWRAP><A HREF="#sec:mixingUnix">3.5</A></TD></TR>
<TR><TD ALIGN=left NOWRAP><TT>P3lstream.iter</TT></TD><TD ALIGN=left NOWRAP>iterates on all elements of a stream</TD><TD ALIGN=left NOWRAP><A HREF="#sec:inputuno">3.1.1</A>, <A HREF="#sec:inputoutputdafile">3.1.2</A>,</TD></TR>
<TR><TD ALIGN=left NOWRAP>&nbsp;</TD><TD ALIGN=left NOWRAP>&nbsp;</TD><TD ALIGN=left NOWRAP><A HREF="#sec:of_fun">3.1.3</A></TD></TR>
<TR><TD ALIGN=left NOWRAP><TT>P3lstream.of_fun</TT></TD><TD ALIGN=left NOWRAP>allows stream generation</TD><TD ALIGN=left NOWRAP>&nbsp;</TD></TR>
<TR><TD ALIGN=left NOWRAP>&nbsp;</TD><TD ALIGN=left NOWRAP>iterating a sequential
function</TD><TD ALIGN=left NOWRAP>&nbsp;</TD></TR>
<TR><TD ALIGN=left NOWRAP>&nbsp;</TD><TD ALIGN=left NOWRAP>which explicitly raises <CODE>End_of_file</CODE></TD><TD ALIGN=left NOWRAP><A HREF="#sec:of_fun">3.1.3</A></TD></TR>
<TR><TD ALIGN=left NOWRAP><TT>P3lstream.to_list</TT></TD><TD ALIGN=left NOWRAP>transforms streams into lists</TD><TD ALIGN=left NOWRAP><A HREF="#sec:to_list">3.1.4</A></TD></TR>
</TABLE>
<DIV CLASS="center"><HR WIDTH="80%" SIZE=2></DIV></BLOCKQUOTE><!--TOC subsection Generating streams from lists-->
<H3 CLASS="subsection"><!--SEC ANCHOR --><A NAME="htoc30">3.1.1</A>&#XA0;&#XA0;Generating streams from lists</H3><!--SEC END --><P>
<A NAME="sec:inputuno"></A> 
</P><PRE CLASS="verbatim">let rec generate_list_of_float n s =
  if ( n &lt;= 0 ) then []
  else s :: (generate_list_of_float (n-1) (s +. 1.0))
;;

let initseq n s = P3lstream.of_list (generate_list_of_float n s);;

let finseq y =
  P3lstream.iter (fun x -&gt; print_float x; print_newline()) y;;

(* Define stage1 and stage 2, can be anything......*)
let stage1 _ x = x +. x;;
let stage2 _ x = x *. x;;

(* definizione network del pipe *)
let pipe = parfun (fun () -&gt; seq(stage1) ||| seq(stage2));;

(* pardo activation *)
pardo
  (fun () -&gt;
     let  s = initseq 10 1.0 in
     let y = pipe s in
       finseq y
  );;
</PRE><!--TOC subsection Generating streams from files-->
<H3 CLASS="subsection"><!--SEC ANCHOR --><A NAME="htoc31">3.1.2</A>&#XA0;&#XA0;Generating streams from files</H3><!--SEC END --><P>
<A NAME="sec:inputoutputdafile"></A>
</P><PRE CLASS="verbatim">let read_input_float fd =
  Scanf.fscanf fd "%f" (fun n -&gt; n);;

let rec fgenerate_list_of_float cin n=
    if ( n &lt;= 0 ) then []
    else 
      (try
  (let s = (read_input_float cin) in
            Printf.printf "Ecco %f\n" s;
     s:: (fgenerate_list_of_float cin (n-1)))
       with
    End_of_file -&gt; [];
      )
;;


(* stream generation *)
let initseq cin n =  
      P3lstream.of_list (fgenerate_list_of_float cin n);;

(* prints out stream *)
let finseq cout y =
  P3lstream.iter (fun x -&gt; Printf.fprintf cout "%f" x; output_string cout "\n") y;;

(* Defines stage1 and stage 2 *)
let stage1 _ x = x +. x;;
let stage2 _ x = x *. x;;

(* defining a two stage pipe network *)
let pipe = parfun (fun () -&gt; seq(stage1) ||| seq(stage2));;

(* pardo activation *)
pardo
  (fun () -&gt;
     let cin = open_in "pippo" and cout = open_out "pluto" in
     let  s = initseq cin 10 in
     let y = pipe s in
       finseq cout y
  )
;;

</PRE><!--TOC subsection Generating streams repeatedly calling a function-->
<H3 CLASS="subsection"><!--SEC ANCHOR --><A NAME="htoc32">3.1.3</A>&#XA0;&#XA0;Generating streams repeatedly calling a function</H3><!--SEC END --><P><A NAME="sec:of_fun"></A>
Here, the stream is generated by repeatedly calling function
<TT>generate_input_stream</TT> via <CODE>P3lstream.of_fun</CODE>.

</P><PRE CLASS="verbatim"> %quinto-esempioUM.ml
(* generating the input stream calling repeatedly a function *)
let generate_input_stream =
  let x = ref 0.0 in
  (function () -&gt; 
    begin
      x := !x +. 1.0;
      if(!x &lt; 10.0) then !x else raise End_of_file
    end);;

(* prints out an integer stream *)
let finseq y =
  P3lstream.iter (fun x -&gt; print_float x; print_newline()) y;;

(* Defines stage1 and stage 2 as identity*)
let stage1 _ x = x;;
let stage2 _ x = x;;

let pipe = parfun (fun () -&gt; seq(stage1) ||| seq(stage2));;

(* pardo activation *)
pardo
  (fun () -&gt;
     let s = P3lstream.of_fun generate_input_stream in     
     let y = pipe s in
       finseq y
  )
;;
</PRE><!--TOC subsection Transforming streams into lists-->
<H3 CLASS="subsection"><!--SEC ANCHOR --><A NAME="htoc33">3.1.4</A>&#XA0;&#XA0;Transforming streams into lists</H3><!--SEC END --><P><A NAME="sec:to_list"></A>

</P><PRE CLASS="verbatim">(* generating the input stream calling repeatedly a function *)
let generate_input_stream =
  let x = ref 0.0 in
  (function () -&gt; 
    begin
      x := !x +. 1.0;
      if(!x &lt; 10.0) then !x else raise End_of_file
    end);;

(* Defines stage1 and stage 2*)
let stage1 _ x = x +. 1.;;
let stage2 _ x = x +. 7.;;

let print_result x =
  print_float x; print_newline();;

let pipe = parfun (fun () -&gt; seq(stage1) ||| seq(stage2));;

(* pardo activation *)
pardo
  (fun () -&gt;
     let is = P3lstream.of_fun generate_input_stream in
     (* transforms a stream into a list *)
     let l = P3lstream.to_list (pipe is) in     
     List.iter  print_result (List.map (fun n -&gt; n*. 4.) l)
  )
;;
</PRE><!--TOC section Global and local definitions-->
<H2 CLASS="section"><!--SEC ANCHOR --><A NAME="htoc34">3.2</A>&#XA0;&#XA0;Global and local definitions</H2><!--SEC END --><P>
As discussed in Section&#XA0;<A HREF="#sec:commonerrors">2.7</A>, global variables <EM>must</EM>
not be used in an <FONT COLOR=purple>OcamlP3l</FONT> program, as their value on different processing
nodes will be different and updates will have effect only on the node which
executes them. </P><P>On the other hand, we can have both local and global definitions which are
evaluated before or after specializing a processing node. For instance we can
share the same file descriptor between all the processing nodes participating
in a skeleton.
See examples in Section&#XA0;<A HREF="#sec:farm">1.3.3</A>.
</P><!--TOC section Managing command line: <TT>option</TT>-->
<H2 CLASS="section"><!--SEC ANCHOR --><A NAME="htoc35">3.3</A>&#XA0;&#XA0;Managing command line: <TT>option</TT></H2><!--SEC END --><P>
To be done
</P><!--TOC section Directing allocation: colors-->
<H2 CLASS="section"><!--SEC ANCHOR --><A NAME="htoc36">3.4</A>&#XA0;&#XA0;Directing allocation: colors</H2><!--SEC END --><P>
To be done
</P><!--TOC section Mixing Unix processes with <FONT COLOR=purple>OcamlP3l</FONT>-->
<H2 CLASS="section"><!--SEC ANCHOR --><A NAME="htoc37">3.5</A>&#XA0;&#XA0;Mixing Unix processes with <FONT COLOR=purple>OcamlP3l</FONT></H2><!--SEC END --><P><A NAME="sec:mixingUnix"></A>
The following code defines a farm in which each worker computes the square
over a stream of floats. However, the function is computed by an external
command which is spawned and connected via stdin / stdout to the farm worker
in the seq skeleton. In the following code, care is taken to activate the
external process only once. <TT>spawned</TT> is <TT>true</TT>
only if the Unix process has been started and the connecting pipe descriptors are maintained in
variables <TT>cin</TT> and <TT>cout</TT>. Notice that the pattern matching in
<TT>(* * *)</TT> is not exhaustive, because if the process has already been spawned
<TT>cin</TT> and <TT>cout</TT> must contain a valid descriptor. 
</P><PRE CLASS="verbatim">let farm_worker _ = 
  let conto = ref 0 and spawned = ref false 
  and cin = ref None 
  and cout = ref None in
    (fun x -&gt; 
       conto := !conto + 1;
       if not !spawned then
  begin
    let (ic,oc) = Unix.open_process "./square" 
    in cin := Some ic; cout := Some oc; spawned:=true
  end;
       let Some ic, Some oc = (!cin , !cout) in (* * *)
  Printf.fprintf oc "%d\n" x; Pervasives.flush oc;
  let i = Scanf.fscanf ic "%d" (fun x -&gt; x) in 
    )
;;

let compute = parfun (fun () -&gt; farm(seq(farm_worker),4));;

let print_result x = print_int x; print_newline();;

pardo(fun () -&gt;
 let is = P3lstream.of_list [1;2;3;4;5;6;7;8;9] in
 let s' = compute is in P3lstream.iter print_result s';
     );;
</PRE><P>The important thing is to remember to flush the data output in
the external command source, otherwise we block. The following is an example of a
valid C definition for the square program.
</P><PRE CLASS="verbatim">#include &lt;stdio.h&gt;
#define TRUE 1

int main(void)
{
  int i;
  while (TRUE) {
      /* reading from standard input */
      scanf("%d",&amp;i);  
      /* writing on standard output */
      printf("%d\n",i*i);
      /* fflushing buffers otherwise you block the output stream */
      fflush(NULL); 
  }
  exit(0);
}
</PRE><!--TOC chapter Implementing <FONT COLOR=purple>OcamlP3l</FONT>-->
<H1 CLASS="chapter"><!--SEC ANCHOR --><A NAME="htoc38">Chapter&#XA0;4</A>&#XA0;&#XA0;Implementing <FONT COLOR=purple>OcamlP3l</FONT></H1><!--SEC END --><P> <A NAME="cap:implementation"></A>
Now, let us point out the peculiar features relative to the
implementation of <FONT COLOR=purple>OcamlP3l</FONT>. First, we will discuss the mechanism
used to implement different processes onto different nodes, by
exploiting a particular form of &#X201C;closure communication&#X201D;. Then, we
will point out some details relative to the interprocess communication
layer and we will motivate the choice of the Unix sockets as the
<FONT COLOR=purple>OcamlP3l</FONT> communication layer. Finally, we will discuss some
details of the templates we used to implement the skeletons provided
by <FONT COLOR=purple>OcamlP3l</FONT>.</P><!--TOC section Closure passing as distributed higher order
parameterization-->
<H2 CLASS="section"><!--SEC ANCHOR --><A NAME="htoc39">4.1</A>&#XA0;&#XA0;Closure passing as distributed higher order
parameterization</H2><!--SEC END --><P>A sequential implementation of an <FONT COLOR=purple>OcamlP3l</FONT> program is quite
easy to provide: just use a library <CODE>seqp3l.ml</CODE> which implements each
skeleton with a valid <FONT COLOR=purple>Ocaml</FONT> sequential function.
The type safety is a direct consequence of the
fact that we are not
using here anything from outside the safe core of Ocaml.</P><P>Similarly, providing the graphical semantics poses no real
challenge.</P><P>But what about the parallel semantics? What is the right way to
implement such a thing? We must guarantee the type safety and ensure
that the runtime is reasonably small as to allow the verification of
its properties, which will become an important point in industrial
applications. Both points posed problems which we overcame during the
development of the system.</P><P>First of all, to ensure that the system is manageable and safe, we
immediately discarded the approach based on parsing the source file to
extract the code corresponding to each node of the network: this would
impose to use external tools to perform an analysis of the user code
which is difficult, error prone,
and whose semantics would have a very unclear status.</P><P>Instead, we choose to use an SPMD approach: all the nodes of the
network will run the <EM>same</EM> program (in a sense this is the
&#X201C;template process interpreter code&#X201D;, as we will see in a while),
which will be the result of the compilation of the full user code, and
a control node<SUP><A NAME="text8" HREF="#note8">1</A></SUP> will dispatch to the rest of the
nodes in the network the parameterization information needed to
specialize it to the particular function it is really supposed to
perform (emitter, collector,
sequential node running a given function <I>f</I>, etc.).</P><P>In order to achieve this behavior, the control node performs the
following tasks:</P><UL CLASS="itemize"><LI CLASS="li-itemize">
executes the <TT>parfun</TT> 
skeleton expression definitions, which has as a consequence to
build a data structure describing all the parallel process networks. From this
data structure, we compute behind the scenes the configuration
information for each node in the process network.
</LI><LI CLASS="li-itemize">executes the <TT>pardo</TT> expression: this has the following
effect
<UL CLASS="itemize"><LI CLASS="li-itemize">
maps virtual nodes to the processor pool given on the command
line,
</LI><LI CLASS="li-itemize">initializes a socket connection with all the participating
nodes,
</LI><LI CLASS="li-itemize">gets the port addresses from each of them (a fixed port number &#X2014;<TT>p3lport</TT>&#X2014; or some dynamically generated number if more than one copy
run on the same machine),
</LI><LI CLASS="li-itemize">sends out to each node the addresses of its connected
neighbors (this step together with the previous two provides an
implementation of a centralized deadlock free algorithm to
interconnect the other nodes into the process network specified by
the skeleton expression),
</LI><LI CLASS="li-itemize">sends out to each node the specialization information that
consists of the <EM>function</EM> it must perform.
</LI></UL>
</LI></UL><P>This very last task requires a sophisticated operation: sending a
<EM>function</EM> (or a closure) over a communication channel. This is usually
not possible in traditional functional programming languages, since sending an
arbitrary function supposes that we are able to find on the receiving side the
code corresponding to the function name received <EM>or</EM> that we can
transfer executable code (a feature known as <EM>mobility</EM> today). Now,
mobility is necessary to send closures between arbitrary programs (since two
different programs have no reason to know each other's function code), but
<EM>not</EM> between two copies of the <EM>same</EM> program: in the latter case,
it suffices to send what essentially amounts to a code pointer. Starting from
version 1.06, <FONT COLOR=purple>Ocaml</FONT> contains a modified marshaling library, originally
designed for the <FONT COLOR=purple>OcamlP3l</FONT> system, that performs closure sending between
copies of the same program (this is checked by means of an MD5 signature of
the program code). The <TT>ocaml</TT> run time system takes care of dealing
with differences in endianness and word size between communicating machines,
as well as flattening tree-shaped data structures.</P><P>On the other side, all the other nodes simply wait for a connection to
come in from the root node, then send out the address of the port they
allocate to do further communication, wait for the list of neighbors
and for the specialization function, then simply perform it until termination.</P><P>To summarize, in the implementation the possibility of sending
closures allowed us to obtain a kind of higher order distributed
parameterization that kept the runtime code to a minimum size (the
source codes of the full system is less than twenty kilobytes).</P><!--TOC section Communication and process support-->
<H2 CLASS="section"><!--SEC ANCHOR --><A NAME="htoc40">4.2</A>&#XA0;&#XA0;Communication and process support</H2><!--SEC END --><P>
As far as the general mechanism of closure passing is concerned, no
particular requirement/restrictions have been posed onto the physical
communication implementation. Even considering the fact that we need
to move data between the different processes making up the parallel
implementation of an <FONT COLOR=purple>OcamlP3l</FONT> program, we derived no particular
constraint onto the communication layer.</P><P>Thus, at the very beginning of the <FONT COLOR=purple>OcamlP3l</FONT> project, we faced the
problem of choosing a suitable communication system. We had as a goal
to come out with the maximum &#X201C;portability&#X201D; of <FONT COLOR=purple>OcamlP3l</FONT>.
Furthermore, we wanted to fully demonstrate the feasibility of
integrating the parallel skeleton world within a functional framework.
These two goals had priority over the classical &#X201C;efficiency and
performance&#X201D; goal one usually has to achieve when dealing with
parallelism.</P><P>The result is that we have adopted the plain Unix socket world as the
communication layer. This has some (very) positive consequences on
the overall <FONT COLOR=purple>OcamlP3l</FONT> design:
</P><UL CLASS="itemize"><LI CLASS="li-itemize">
the socket communication support is generally available on Unix/Linux and
Windows systems,
</LI><LI CLASS="li-itemize">no particular customization of the support is needed to match
the <FONT COLOR=purple>OcamlP3l</FONT> features,
</LI><LI CLASS="li-itemize">the point-to-point, connection oriented, stream model provided by
Unix sockets is perfect to model data streams flowing around
between the processes belonging to the process network derived by
<FONT COLOR=purple>OcamlP3l</FONT> to implement the user skeleton code,
</LI><LI CLASS="li-itemize">last but not least, there was an existing and suitable
<TT>ocaml</TT> interface to Unix system calls, including those
relative to sockets.
</LI></UL><P>On the down side, the adoption of Unix sockets presents an evident
disadvantage which is the low performance achieved in communications
(a raw synchronization (i.e.  zero length data communication) takes
several milliseconds to be performed, even in those cases when the
data transmission media turns out to be free, i.e.  no collisions are
detected).</P><P>At the moment, we are considering to use in the next version of <FONT COLOR=purple>OcamlP3l</FONT> a
communication layer based on an optimized communication library such as MPI
[<A HREF="#mpi"><CITE>18</CITE></A>], as an efficient alternative to the socket communication layer,
which will be nevertheless retained for its ease of deployment, that makes it
attractive for programming courses.</P><P>Porting to MPI will require some modifications in the template code used within
<FONT COLOR=purple>OcamlP3l</FONT>, and will not necessarily completely solve the performance
problems of the socket communication layer when run on a network of
computers, where most MPI libraries are still implemented using
sockets, but will allow to target real multiprocessor machines where
MPI is efficiently implemented, without touching the user code. Also,
we will be able to delegate to the MPI system the administrative tasks
involved in copying and launching the programs on the different
machines.</P><P>As far as the process model is concerned, we felt happy with the Unix
one. All we need is a mechanism allowing an instance of the template
interpreter (the one specialized by using the closure passing
mechanism) to be run onto different workstations belonging to a local
area network. The Unix <TT>rsh</TT> mechanism matches this
requirement. Note that, as processes are generated and run on
different machines just at the beginning of the <FONT COLOR=purple>OcamlP3l</FONT> program
execution, any considerations about performance in <TT>rsh</TT>-ing
is irrelevant.</P><!--TOC section Template implementation-->
<H2 CLASS="section"><!--SEC ANCHOR --><A NAME="htoc41">4.3</A>&#XA0;&#XA0;Template implementation</H2><!--SEC END --><P>
<FONT COLOR=purple>OcamlP3l</FONT> implements each skeleton appearing in the application
code by generating a proper instantiation of the corresponding
implementation template. In <FONT COLOR=purple>OcamlP3l</FONT>, a single implementation
template is provided for each one of the skeletons supported. The
implementation templates provided within the current prototype closely
resemble the ones discussed in the informal parallel semantics
section (Section&#XA0;<A HREF="#sec:skeletons">1.3</A>). Actually, only the
<I>reduce</I> template is slightly different, in that the tree
discussed in Section <A HREF="#sec:red">1.3.7</A> is actually implemented
by a process network similar to the one discussed for the
farm, where partially evaluated data is iteratively passed back from
the collector to the emitter process. We are currently studying a more
efficient mechanism based on a formal calculus for data distribution and
computation over dense arrays [<A HREF="#Zheng03"><CITE>17</CITE></A><CITE>, </CITE><A HREF="#hlpp03"><CITE>8</CITE></A><CITE>, </CITE><A HREF="#clss05"><CITE>7</CITE></A>].</P><P>Each template appearing in <FONT COLOR=purple>OcamlP3l</FONT>:
</P><UL CLASS="itemize"><LI CLASS="li-itemize">
<EM>is parametric in the parallelism degree exploited</EM>. As an example,
the farm template may accommodate any positive number of worker
processes. Currently, the programmer must specify this
parameter, which is the second parameter of a
<TT>farm(f,n)</TT> skeleton call.
</LI><LI CLASS="li-itemize"><EM>is parametric in the function computed as the body of the
skeleton</EM>. For instance, the farm skeleton accepts as a parameter
the function that has to be computed by the worker processes. This
function must be a skeleton itself. Therefore, either it is a
<TT>seq</TT> skeleton call, or it
is a skeleton call modeling a parallel computation. In the
former case, the skeleton is implemented by a process network whose
workers just perform the sequential computation <TT>f</TT> denoted
by some <TT>seq(f)</TT>. In the latter case, each worker process is
itself a process network known by the emitter and collector
processes implementing the farm just as channels where data has to
be delivered/fetched.
</LI><LI CLASS="li-itemize"><EM>provides a set of process templates</EM>, i.e.  parametric process
specifications that can be instantiated to get the real process
codes building out the implementation template process network. As
an example, consider again the farm template. The emitter process
behavior can be fully specified by the data type of items that have
to be processed, by the channel from which those data items have to
be read and by the set of channels onto which the data items have to
be scheduled (written) to the worker processes, possibly with some
&#X201C;clever&#X201D; (e.g.  achieving load balancing) scheduling strategy. Such
a process can be completely specified by providing a function
<DIV CLASS="center">
<TT>farmetempl (OutChanSel f) ic ocl</TT>
</DIV>
whose first parameter provides the worker scheduling
function, the second one provides the input channel where data has
to be fetched and the third one provides the set of channels used to
deliver tasks to be computed to the farm workers. The type of such a
function turns out to be
<PRE CLASS="verbatim">val farmetempl : ('a, 'b) Parp3l.action -&gt; in_channel -&gt;
                 out_channel list -&gt; unit
</PRE>
The process template definition in the <FONT COLOR=purple>OcamlP3l</FONT> code looks like
the following:
<PRE CLASS="verbatim">let farmetempl (OutChanSel f) ic ocl = 
  while true do
    try
      let theoc = f ocl in
      match (Marshal.from_channel ic) with
        UserPacket(p,seqn,tl) -&gt; 
          Marshal.to_channel 
            theoc 
            (UserPacket (p,seqn,Farmtag::tl))
            [Marshal.Closures]; 
          flush theoc;
      | EndStream  -&gt; 
          List.iter 
           (fun x -&gt; Marshal.to_channel 
                       x 
                       EndStream 
                       [Marshal.Closures]; 
                     flush x) 
           ocl; 
          List.iter close_out ocl; close_in ic; exit 0
    with End_of_file -&gt; List.iter close_out ocl; 
                        close_in ic
  done;;
</PRE>
</LI></UL><P>
Therefore the whole compilation process transforming an <FONT COLOR=purple>OcamlP3l</FONT>
skeleton program into the parallel process network implementing the
program can be summarized in the following steps:</P><OL CLASS="enumerate" type=1><LI CLASS="li-enumerate">
the skeleton code is parsed and transformed into a skeleton tree
data structure, recording all the significant details of the
skeleton nesting supplied by the user code,
</LI><LI CLASS="li-enumerate">the skeleton tree is traversed and processes are assigned to
each skeleton according to the implementation templates. During this
phase, processes are denoted by their input/output channels,
identified via a unique number.
</LI><LI CLASS="li-enumerate">once the number and the kind of parallel processes building out
the skeleton code implementation is known, code is generated that
either delivers the proper closures, derived by using the process
templates, to the &#X201C;template interpreter&#X201D; instances running on
distinct workstations (this happens just on one node, the &#X201C;root&#X201D;
one), or waits for a closure and repeatedly computes this closure on
the proper input and output channels until an <TT>EndOfFile</TT>
mark is received.
</LI></OL><!--BEGIN NOTES chapter-->
<HR CLASS="footnoterule"><DL CLASS="thefootnotes"><DT CLASS="dt-thefootnotes">
<A NAME="note8" HREF="#text8">1</A></DT><DD CLASS="dd-thefootnotes">The control node runs the same program as the
others, but it is invoked by the user with a special designating
option <TT>-p3lroot</TT>.
</DD></DL>
<!--END NOTES-->
<!--TOC chapter Multivariant semantics and logical debugging-->
<H1 CLASS="chapter"><!--SEC ANCHOR --><A NAME="htoc42">Chapter&#XA0;5</A>&#XA0;&#XA0;Multivariant semantics and logical debugging</H1><!--SEC END --><P>By providing modules that implement the three <FONT COLOR=purple>OcamlP3l</FONT> skeleton
semantics (the sequential one, the parallel one and the graphical
one), we allow the <FONT COLOR=purple>OcamlP3l</FONT> user to perform the following parallel
application development process:
</P><UL CLASS="itemize"><LI CLASS="li-itemize">
develop skeleton code modeling the application at hand. This
just requires a full understanding of the skeleton sequential
semantics and usually allows the user to reuse consistent portions
of existing applications written in plain <TT>ocaml</TT> or legacy in C, C++
etc. 
</LI><LI CLASS="li-itemize">test the functionality of the new application by supplying
relevant input data items and looking at the results computed using
the sequential skeleton semantics. In case of problems, the user may
run the sequential debugging tools to overcome problems.
</LI><LI CLASS="li-itemize">link the parallel skeleton semantics module and run the
application onto the workstation network. Provided that the
application was sequentially correct, no new errors should be found at
this step (we assume that the run time is guaranteed correct!). In practice,
a few errors can occur, usually related to wrong assumptions on global
variables. If your code uses global variables, updating them during
execution will work OK in the sequential semantics, as memory is
actually shared, but not in the parallel version, as updates will only be seen
locally on the processing node.
</LI><LI CLASS="li-itemize">look at the performance results of running the application on
the number of processors available and possibly adjust the
significant performance parameters, such as the number of workers of
the <TT>farm</TT>, <TT>map</TT> and <TT>reduce</TT> and the color of each of
them. This is
actually the real problem in the development of an
<I>efficient</I> parallel application. Forthcoming versions of
<FONT COLOR=purple>OcamlP3l</FONT> will include analytical performance models for the
templates, and these models will be used to automatically guess
colors and guide the compiler.
During performance tuning, the programmer may link the graphic semantic
skeleton module and look at the results of the program execution,
i.e.  at the resulting process graph, in order to understand where
bottlenecks are or which parts of the program must be further
decomposed using skeletons in order to get better performing
application code.
</LI></UL><P>Let us spend now some words concerning logical, sequential debugging
of <FONT COLOR=purple>OcamlP3l</FONT> applications.</P><P>A user developing an <FONT COLOR=purple>OcamlP3l</FONT> application may link the sequential
skeleton semantics module to his/her code and debug the application by
using the plain sequential debugging tools of <TT>ocaml</TT>. This
debugging activity can be performed on a single machine, provided the
machine supplies <TT>ocaml</TT>. Also, performance tuning can be directed
using standard sequential profiling tools such as <TT>gprof</TT>.</P><P>Once the application has been debugged, i.e.  the user perceives it
computes the expected results, he/she can compile the parallel version of the application by linking the parallel
skeleton semantics. As we guarantee that the implementation templates
for the different skeletons of <FONT COLOR=purple>OcamlP3l</FONT> are correct (deadlock
free, load-balanced, etc.) and as we guarantee that the process
transforming the skeleton code in the process code is correct, the
user does not need to perform explicit activities in order to check
that the results computed by the parallel code are correct.</P><P>In particular, the user does not need to check that all the processes
have been correctly scheduled for execution, or that the communication
channels have been set up properly between these processes, or that
data of type <TT>'a</TT> has been never delivered on channels
transmitting data of type <TT>'b</TT>. This is a very short list of
bad things that may affect the correct behavior of an explicitly
parallel program, indeed. The fact that the user is not required at
all to take them into account is one of the biggest pro's of the
functional skeleton approach.</P><!--TOC chapter Related work, conclusions and perspectives-->
<H1 CLASS="chapter"><!--SEC ANCHOR --><A NAME="htoc43">Chapter&#XA0;6</A>&#XA0;&#XA0;Related work, conclusions and perspectives</H1><!--SEC END --><!--TOC section Related work-->
<H2 CLASS="section"><!--SEC ANCHOR --><A NAME="htoc44">6.1</A>&#XA0;&#XA0;Related work</H2><!--SEC END --><P>
Many researchers are currently working on skeletons and most of them
are building some kind of parallel implementation, but our work, as
far as we know, is unique in its combination of a fully functional
strongly typed language with the skeleton approach.</P><P>In particular, Darlington's group at Imperial College in
London is actively working on skeletons. They have explored the
problems relative to implementing a skeleton programming system, but
the approach taken uses an imperative language as the implementation
language. Currently there is a &#X201C;local&#X201D; prototype implementation but
no public domain implementation of their skeleton approach and they
seem deeply involved in the study of the data-parallel and
coordination aspects of skeletons.
[<A HREF="#darli-to-1"><CITE>13</CITE></A><CITE>, </CITE><A HREF="#darli-to-2"><CITE>12</CITE></A><CITE>, </CITE><A HREF="#Darlington1996"><CITE>1</CITE></A>]</P><P>A different approach relative to skeleton parallel programming within
a functional framework has been discussed by Bratvold
[<A HREF="#bratvold-thesis"><CITE>4</CITE></A>]. Bratvold takes into account plain ML
programs and looks for skeletons within them, compiling these
skeletons by using process networks that look like implementation
templates. However, both the final target language and the
implementation language are imperative. </P><P>Finally, S&#XE9;rot [<A HREF="#serot97"><CITE>23</CITE></A><CITE>, </CITE><A HREF="#parco02"><CITE>22</CITE></A>], presents an embedding of skeletons within
<TT>ocaml</TT> that seems to be close to our work, although independently
developed. The message passing is performed by interfacing the MPI library with
<TT>ocaml</TT>. The skeletons taken into account are different. He considers
data-parallel <TT>farm</TT>, roughly corresponding to our <TT>mapvector</TT>
skeleton, and two further skeletons, <TT>scm</TT> and
<TT>filt</TT>. <TT>filt</TT> is a plain filter skeleton, canceling data items
from a list, while <TT>scm</TT> (Split, Compute and Merge) looks like a map
skeleton working on lists with explicit, user defined,
decomposition/recomposition functions.</P><P>S&#XE9;rot's implementation of the skeletons within <TT>ocaml</TT> is quite different
from ours and only allows one skeleton at a time to be realized on the processor
networks, thus preventing skeleton composition (you cannot nest two <TT>scm</TT>
skeletons for example), and only allowing for a limited form of staging of the
parallel computation: you can perform an <TT>scm</TT>, then when this is
finished, you can reorganize your network and perform another <TT>scm</TT>.
This way, the mapping of virtual processors to real processors on the network is
a trivial task, and is done inside each skeleton at run-time instead of
beforehand in a specific pass like in <FONT COLOR=purple>OcamlP3l</FONT>. S&#XE9;rot implements the
skeletons included in his model by providing second order functions that
directly call MPI and realize an SPMD execution model.</P><P>As for the relevant effort done in the field of languages for mobile agents,
like for example&#XA0;[<A HREF="#KnabePhD"><CITE>16</CITE></A><CITE>, </CITE><A HREF="#FournetMaranget_Join_calculus_language"><CITE>15</CITE></A>], it
should be noted that they address quite a different kind of problems, but once
stable, these languages could form the basis of a next generation fully
fault-tolerant and dynamically load-balanced version of our system.</P><!--TOC section Conclusions and perspectives-->
<H2 CLASS="section"><!--SEC ANCHOR --><A NAME="htoc45">6.2</A>&#XA0;&#XA0;Conclusions and perspectives</H2><!--SEC END --><P>
Here we showed how a skeleton parallel programming model can be successfully
married with functional
programming environments such as the one provided by <FONT COLOR=purple>Ocaml</FONT>.</P><P>In particular, we discussed a powerful skeletal model, 
how skeletons can be embedded within <FONT COLOR=purple>Ocaml</FONT> as second order functions, and how
modules implement
both the sequential and the parallel skeletons; we also discussed the typical
application development cycle in <FONT COLOR=purple>OcamlP3l</FONT>.
The whole process
preserves the strong typing properties of <TT>ocaml</TT>.</P><P>At the moment, the prototype <FONT COLOR=purple>OcamlP3l</FONT>  implementation is being tested as
described in this paper and is available from the <FONT COLOR=purple>OcamlP3l</FONT> project home Web
page </P><P><TT>http://www.dicosmo.org/ocamlp3l/</TT>.</P><P>In the near future we want first of all to include a more powerful MAP
skeleton working on dense arrays with an arbitrary number of
dimensions&#XA0;[<A HREF="#Zheng03"><CITE>17</CITE></A><CITE>, </CITE><A HREF="#hlpp03"><CITE>8</CITE></A><CITE>, </CITE><A HREF="#clss05"><CITE>7</CITE></A>]. This will call for a more
efficient communication layer, by using collective MPI communications
[<A HREF="#mpi"><CITE>18</CITE></A>] instead of the plain Unix socket library. At the same time, we will
investigate the feasibility of porting the 
system on the ubiquitous Windows boxes, for didactical purposes. Finally,
we already developed a parallel numerical code [<A HREF="#clement04"><CITE>5</CITE></A>] and
plan to write some more significant parallel applications in order to fully
test the prototype.</P><!--TOC chapter References-->
<H1 CLASS="chapter"><!--SEC ANCHOR -->References</H1><!--SEC END --><DL CLASS="thebibliography"><DT CLASS="dt-thebibliography">
<A NAME="Darlington1996"><FONT COLOR=purple>[1]</FONT></A></DT><DD CLASS="dd-thebibliography">
P.&#XA0;Au, J.&#XA0;Darlington, M.&#XA0;Ghanem, Y.&#XA0;Guo, H.W. To, and J.&#XA0;Yang.
Co-ordinating heterogeneous parallel computation.
In L.&#XA0;Bouge, P.&#XA0;Fraigniaud, A.&#XA0;Mignotte, and Y.&#XA0;Robert, editors, <EM>Europar '96</EM>, pages 601&#X2013;614. Springer-Verlag, 1996.</DD><DT CLASS="dt-thebibliography"><A NAME="orlando-grosso"><FONT COLOR=purple>[2]</FONT></A></DT><DD CLASS="dd-thebibliography">
B.&#XA0;Bacci, M.&#XA0;Danelutto, S.&#XA0;Orlando, S.&#XA0;Pelagatti, and M.&#XA0;Vanneschi.
P<SUP>3</SUP>L: A Structured High level programming language and its
structured support.
<EM>Concurrency Practice and Experience</EM>, 7(3):225&#X2013;255, May 1995.</DD><DT CLASS="dt-thebibliography"><A NAME="bird1"><FONT COLOR=purple>[3]</FONT></A></DT><DD CLASS="dd-thebibliography">
R.&#XA0;S. Bird.
An introduction to the Theory of Lists.
In M.&#XA0;Broy, editor, <EM>Logic of programming and calculi of discrete
design</EM>, volume F36 of <EM>NATO ASI</EM>, pages 5&#X2013;42. Springer-Verlag, Berlin,
1987.</DD><DT CLASS="dt-thebibliography"><A NAME="bratvold-thesis"><FONT COLOR=purple>[4]</FONT></A></DT><DD CLASS="dd-thebibliography">
T.&#XA0;Bratvold.
<EM>Skeleton-Based Parallelisation of Functional Programs</EM>.
PhD thesis, Heriot-Watt University, 1994.</DD><DT CLASS="dt-thebibliography"><A NAME="clement04"><FONT COLOR=purple>[5]</FONT></A></DT><DD CLASS="dd-thebibliography">
Fran&#XE7;ois Cl&#XE9;ment, Roberto Di Cosmo, Zheng Li, Vincent Martin, Arnaud
Vodicka, and Pierre Weis.
Parallel programming with the system applications to numerical code
coupling.
Technical Report RR-5131, INRIA Roquencourt, 2004.</DD><DT CLASS="dt-thebibliography"><A NAME="cole-th"><FONT COLOR=purple>[6]</FONT></A></DT><DD CLASS="dd-thebibliography">
M.&#XA0;Cole.
<EM>Algorithmic Skeletons: Structured Management of Parallel
Computations</EM>.
Research Monographs in Parallel and Distributed Computing. Pitman,
1989.</DD><DT CLASS="dt-thebibliography"><A NAME="clss05"><FONT COLOR=purple>[7]</FONT></A></DT><DD CLASS="dd-thebibliography">
Roberto&#XA0;Di Cosmo, Zheng Li, and Susanna Pelagatti.
A calculus for parallel computations over multidimensional dense
arrays.
to appear on <EM>Computer Languages, Systems and Structures</EM>
special issue on &#X201C;Semantics and Cost models for High-Level Parallel
Programming&#X201D;, 2005.</DD><DT CLASS="dt-thebibliography"><A NAME="hlpp03"><FONT COLOR=purple>[8]</FONT></A></DT><DD CLASS="dd-thebibliography">
Roberto&#XA0;Di Cosmo and Susanna Pelagatti.
A cost calculus for dense array distributions.
<EM>Parallel Processing Letters</EM>, 13(3):377&#X2013;388, 2003.</DD><DT CLASS="dt-thebibliography"><A NAME="fgcs-firenze"><FONT COLOR=purple>[9]</FONT></A></DT><DD CLASS="dd-thebibliography">
M.&#XA0;Danelutto, R.&#XA0;Di Meglio, S.&#XA0;Orlando, S.&#XA0;Pelagatti, and M.&#XA0;Vanneschi.
A methodology for the development and support of massively parallel
programs.
<EM>Future Generation Computer Systems</EM>, 8(1&#X2013;3):205&#X2013;220, July
1992.</DD><DT CLASS="dt-thebibliography"><A NAME="Ocamlp3lMlw98"><FONT COLOR=purple>[10]</FONT></A></DT><DD CLASS="dd-thebibliography">
Marco Danelutto, Roberto Di&#XA0;Cosmo, Xavier Leroy, and Susanna Pelagatti.
Parallel functional programming with skeletons: the ocamlp3l
experiment.
<EM>The ML Workshop</EM>, 1998.</DD><DT CLASS="dt-thebibliography"><A NAME="ic-parle-93-1"><FONT COLOR=purple>[11]</FONT></A></DT><DD CLASS="dd-thebibliography">
J.&#XA0;Darlington, A.&#XA0;J. Field, P.&#XA0;G. Harrison, P.&#XA0;H.&#XA0;J. Kelly, D.&#XA0;W.&#XA0;N. Sharp, and
Q.&#XA0;Wu.
Parallel Programming Using Skeleton Functions.
In <EM>PARLE'93</EM>, pages 146&#X2013;160. Springer, 1993.
LNCS No. 694.</DD><DT CLASS="dt-thebibliography"><A NAME="darli-to-2"><FONT COLOR=purple>[12]</FONT></A></DT><DD CLASS="dd-thebibliography">
J.&#XA0;Darlington, Y.&#XA0;Guo, H.&#XA0;W. To, Q.&#XA0;Wu, J.&#XA0;Yang, and M.&#XA0;Kohler.
Fortran-S: A Uniform Functional Interface to Parallel Imperative
Languages.
In <EM>Third Parallel Computing Workshop (PCW'94)</EM>. Fujitsu
Laboratories Ltd., November 1994.</DD><DT CLASS="dt-thebibliography"><A NAME="darli-to-1"><FONT COLOR=purple>[13]</FONT></A></DT><DD CLASS="dd-thebibliography">
J.&#XA0;Darlington, Y.&#XA0;Guo, H.&#XA0;W. To, and J.&#XA0;Yang.
Parallel Skeletons for Structured Composition.
In <EM>Fifth ACM SIGPLAN Symposium on Principles and Practice of
Parallel Programming</EM>. ACM Press, July 1995.</DD><DT CLASS="dt-thebibliography"><A NAME="FournetGonthier96_rcham-join-calculus"><FONT COLOR=purple>[14]</FONT></A></DT><DD CLASS="dd-thebibliography">
C&#XE9;dric Fournet and Georges Gonthier.
The reflexive chemical abstract machine and the join-calculus.
In <EM>Proceedings of the 23rd ACM Symposium on Principles of
Programming Languages</EM>, pages 372&#X2013;385, St. Petersburg Beach, Florida,
January 21-24 1996. ACM.</DD><DT CLASS="dt-thebibliography"><A NAME="FournetMaranget_Join_calculus_language"><FONT COLOR=purple>[15]</FONT></A></DT><DD CLASS="dd-thebibliography">
Cedric Fournet and Luc Maranget.
<EM>The Join-Calculus language</EM>.
INRIA, June 1997.
Software and documentation available electronically, (<TT>http://pauillac.inria.fr/join</TT>).</DD><DT CLASS="dt-thebibliography"><A NAME="KnabePhD"><FONT COLOR=purple>[16]</FONT></A></DT><DD CLASS="dd-thebibliography">
F.&#XA0;C. Knabe.
<EM>Language Support for Mobile Agents</EM>.
PhD thesis, School of Computer Science, Carnegie Mellon University,
1995.
CMU-CS-95-223; also published as Technical Report ECRC-95-36.</DD><DT CLASS="dt-thebibliography"><A NAME="Zheng03"><FONT COLOR=purple>[17]</FONT></A></DT><DD CLASS="dd-thebibliography">
Zheng Li.
Efficient implementation of MAP skeleton for the OcamlP3L system.
DEA Report, Universit&#XE9; PARIS VII, July 2003.</DD><DT CLASS="dt-thebibliography"><A NAME="mpi"><FONT COLOR=purple>[18]</FONT></A></DT><DD CLASS="dd-thebibliography">
M.P.I. Forum.
Document for a standard message-passing interface.
Technical Report CS-93-214, University of Tennessee, November 1993.</DD><DT CLASS="dt-thebibliography"><A NAME="tesi-susanna"><FONT COLOR=purple>[19]</FONT></A></DT><DD CLASS="dd-thebibliography">
S.&#XA0;Pelagatti.
A methodology for the development and the support of massively
parallel programs.
Technical Report TD-11/93, Dept. of Computer Science &#X2013; Pisa, 1993.
PhD Thesis.</DD><DT CLASS="dt-thebibliography"><A NAME="libro-susi"><FONT COLOR=purple>[20]</FONT></A></DT><DD CLASS="dd-thebibliography">
S.&#XA0;Pelagatti.
<EM>Structured development of parallel programs</EM>.
Taylor&amp;Francis, London, 1998.</DD><DT CLASS="dt-thebibliography"><A NAME="skbook02"><FONT COLOR=purple>[21]</FONT></A></DT><DD CLASS="dd-thebibliography">
S.&#XA0;Pelagatti.
Task and data parallelism in P3L.
In Fethi&#XA0;A. Rabhi and Sergei Gorlatch, editors, <EM>Patterns and
Skeletons for Parallel and Distributed Computing</EM>, chapter&#XA0;6, pages 155&#X2013;186.
Springer-Verlag, London, 2002.</DD><DT CLASS="dt-thebibliography"><A NAME="parco02"><FONT COLOR=purple>[22]</FONT></A></DT><DD CLASS="dd-thebibliography">
J.&#XA0;S&#XE9;rot and D.&#XA0;Ginhac.
Skeletons for parallel image processing: an overview of the
SKiPPER project.
<EM>Parallel Computing</EM>, 28(12):1785&#X2013;1808, Dec 2002.</DD><DT CLASS="dt-thebibliography"><A NAME="serot97"><FONT COLOR=purple>[23]</FONT></A></DT><DD CLASS="dd-thebibliography">
Jocelyn S&#XE9;rot.
Embodying parallel functional skeletons: an experimental
implementation on top of MPI.
In <EM>Proceedings of the EuroPar 97</EM>. Springer Verlag, LNCS No.
1300, 1997.
Passau, Germany.</DD></DL><!--TOC chapter Index-->
<H1 CLASS="chapter"><!--SEC ANCHOR -->Index</H1><!--SEC END --><P></P><TABLE CELLSPACING=6 CELLPADDING=0><TR><TD VALIGN=top ALIGN=left><UL CLASS="indexenv"><LI CLASS="li-indexenv">
control
skeletons, <A HREF="#@default4">1.2</A>
<BR>
<BR>
</LI><LI CLASS="li-indexenv">data parallel skeletons, <A HREF="#@default3">1.2</A>
<BR>
<BR>
</LI><LI CLASS="li-indexenv">farm, <A HREF="#@default1">1.2</A>
<BR>
<BR>
</LI><LI CLASS="li-indexenv">global initialization<UL CLASS="indexenv"><LI CLASS="li-indexenv">
example: magic
number, <A HREF="#@default8">1.3.3</A>
</LI></UL>
<BR>
<BR>
</LI><LI CLASS="li-indexenv">initialization<UL CLASS="indexenv"><LI CLASS="li-indexenv">
local, <A HREF="#@default7">1.2.3</A>
</LI></UL>
</LI></UL></TD><TD VALIGN=top ALIGN=left><UL CLASS="indexenv"><LI CLASS="li-indexenv">local initialization, <A HREF="#@default6">1.2.3</A>
<UL CLASS="indexenv"><LI CLASS="li-indexenv">
example: magic number, <A HREF="#@default9">1.3.3</A>
</LI></UL>
<BR>
<BR>
</LI><LI CLASS="li-indexenv">pipeline, <A HREF="#@default0">1.2</A>
<BR>
<BR>
</LI><LI CLASS="li-indexenv">service skeletons, <A HREF="#@default5">1.2</A>
<BR>
<BR>
</LI><LI CLASS="li-indexenv">task parallel skeletons, <A HREF="#@default2">1.2</A>
</LI></UL></TD></TR>
</TABLE><!--CUT END -->
<!--BEGIN STICKYNOTES document-->
<HR CLASS="footnoterule"><DL CLASS="thefootnotes"><DT CLASS="dt-thefootnotes">
<A NAME="note1" HREF="#text1">1</A></DT><DD CLASS="dd-thefootnotes">University of Paris 7 - France
</DD><DT CLASS="dt-thefootnotes"><A NAME="note2" HREF="#text2">2</A></DT><DD CLASS="dd-thefootnotes">Dipartimento di Informatica - University of Pisa - Italy
</DD><DT CLASS="dt-thefootnotes"><A NAME="note3" HREF="#text3">3</A></DT><DD CLASS="dd-thefootnotes">INRIA Rocquencourt - France
</DD></DL>
<!--END NOTES-->
<!--HTMLFOOT-->
<!--ENDHTML-->
<!--FOOTER-->
<HR SIZE=2><BLOCKQUOTE CLASS="quote"><EM>This document was translated from L<sup>A</sup>T<sub>E</sub>X by
</EM><A HREF="http://hevea.inria.fr/index.html"><EM>H</EM><EM><FONT SIZE=2><sup>E</sup></FONT></EM><EM>V</EM><EM><FONT SIZE=2><sup>E</sup></FONT></EM><EM>A</EM></A><EM>.</EM></BLOCKQUOTE></BODY>
</HTML>
back to top