%% Main LaTeX source of project's paper, license is printed in the end.
%
%% Copyright (C) 2020 Mohammad Akhlaghi <mohammad@akhlaghi.org>
%% Copyright (C) 2020 Raúl Infante-Saiz <infantesainz@gmail.com>
%% Copyright (C) 2020 Boudewijn F. Roukema <boud@astro.uni.torun.pl>
%% Copyright (C) 2020 David Valls-Gabaud <david.valls-gabaud@obspm.fr>
%% Copyright (C) 2020 Roberto Baena-Gallé <roberto.baena@gmail.com>
\documentclass[journal]{IEEEtran}

%% This is a convenience variable if you are using PGFPlots to build plots
%% within LaTeX. If you want to import PDF files for figures directly, you
%% can use the standard `\includegraphics' command. See the definition of
%% `\includetikz' in `tex/preamble-pgfplots.tex' for where the files are
%% assumed to be if you use `\includetikz' when `\makepdf' is not defined.
\newcommand{\makepdf}{}

%% When defined (value is irrelevant), `\highlightchanges' will cause text
%% in `\tonote' and `\new' to become colored. This is useful in cases where
%% you need to distribute drafts that are undergoing revision and you want
%% to highlight to your colleagues which parts are new and which parts are
%% only for discussion.
%\newcommand{\highlightchanges}{}

%% Import necessary packages
\input{tex/build/macros/project.tex}
\input{tex/src/preamble-project.tex}
\input{tex/src/preamble-pgfplots.tex}

%% Title and author names.
\title{Towards Long-term and Archivable Reproducibility}
\author{
  Mohammad~Akhlaghi,
  Ra\'ul Infante-Sainz,
  Boudewijn F. Roukema,
  David Valls-Gabaud,
  Roberto Baena-Gall\'e
  \thanks{Manuscript received MM DD, YYYY; revised MM DD, YYYY.}
}

%% The paper headers
\markboth{Computing in Science and Engineering, Vol. X, No. X, MM YYYY}%
{Akhlaghi \MakeLowercase{\textit{et al.}}: Towards Long-term and Archivable Reproducibility}










%% Start the paper.
\begin{document}

% make the title area
\maketitle

% As a general rule, do not put math, special symbols or citations
% in the abstract or keywords.
\begin{abstract}
  %% CONTEXT
  Reproducible workflow solutions commonly use the high-level technologies that were popular when they were created, providing an immediate solution that is unlikely to be sustainable in the long term.
  %% AIM
  We aim to introduce a set of criteria to address this problem and to demonstrate their practicality.
  %% METHOD
  The criteria have been tested in several research publications and can be summarized as: completeness (no dependency beyond a POSIX-compatible operating system, no administrator privileges, no network connection and storage primarily in plain text); modular design; linking analysis with narrative; temporal provenance; scalability; and free-and-open-source software.
  %% RESULTS
  Through an implementation called ``Maneage'', we find that storing the project in machine-actionable and human-readable plain text enables version control, cheap archiving, automatic parsing to extract data provenance, and peer-reviewable verification.
  Furthermore, we find that these criteria are not limited to long-term reproducibility, but also provide immediate benefits for short-term reproducibility.
  %% CONCLUSION
  We conclude that requiring longevity of a reproducible workflow solution is realistic.
  We discuss the benefits of these criteria for scientific progress.
\end{abstract}

% Note that keywords are not normally used for peerreview papers.
\begin{IEEEkeywords}
Data Lineage, Provenance, Reproducibility, Scientific Pipelines, Workflows
\end{IEEEkeywords}






% For peer review papers, you can put extra information on the cover
% page as needed:
% \ifCLASSOPTIONpeerreview
% \begin{center} \bfseries EDICS Category: 3-BBND \end{center}
% \fi
%
% For peerreview papers, this IEEEtran command inserts a page break and
% creates the second title. It will be ignored for other modes.
\IEEEpeerreviewmaketitle



\section{Introduction}
% The very first letter is a 2 line initial drop letter followed
% by the rest of the first word in caps.
%\IEEEPARstart{F}{irst} word

Reproducible research has been discussed in the sciences for at least 30 years \cite{claerbout1992, fineberg19}.
Many reproducible workflow solutions (hereafter, ``solution(s)'') have been proposed, mostly relying on the common technology of the day: starting with Make and Matlab libraries in the 1990s, moving to Java in the 2000s, and mostly shifting to Python during the last decade.
Recently, controlling the environment has been facilitated through generic package managers (PMs) and containers.

However, because of their high-level nature, such third-party tools for the workflow (not the analysis) develop very fast; e.g., Python 2 code often cannot run with Python 3, interrupting many projects.
Containers (in custom binary formats) are also heavily used, but are large (gigabytes) and expensive to archive.
Moreover, once the binary format is obsolete, reading or parsing the project becomes impossible.

The cost of staying up to date within this rapidly evolving landscape is high.
Scientific projects, in particular, suffer the most: scientists have to focus on their own research domain, but to some degree they need to understand the technology of their tools, because it determines their results and interpretations.
Decades later, scientists are still held accountable for their results.
Hence, the evolving technology landscape creates generational gaps in the scientific community, preventing previous generations from sharing valuable lessons which are too low-level to be published in a traditional scientific paper.
As a solution to this problem, here we introduce a set of criteria that can guarantee the longevity of a project based on our experience with existing solutions.





\section{Commonly used tools and their longevity}
To highlight the necessity of longevity, some of the most commonly used tools are reviewed here, from the perspective of long-term usability.
While longevity is important in science and some fields of industry, it is not always a primary concern; e.g., fast-evolving tools can be appropriate in short-term commercial projects.
Most existing reproducible workflows use a common set of third-party tools that can be categorized as:
(1) environment isolators -- virtual machines (VMs) or containers;
(2) PMs -- Conda, Nix, or Spack;
(3) job management -- shell scripts, Make, SCons, or CGAT-core;
(4) notebooks -- such as Jupyter.

To isolate the environment, VMs have sometimes been used, e.g., in \href{https://is.ieis.tue.nl/staff/pvgorp/share}{SHARE} (which was awarded second prize in the Elsevier Executable Paper Grand Challenge of 2011 but discontinued in 2019).
However, containers (in particular Docker, and to a lesser degree Singularity) are by far the most widely used solution today; we will thus focus on Docker here.

Ideally, it is possible to precisely identify the images that are imported into a Docker container by their checksums.
However, this is rarely practiced in the solutions that we have studied.
Usually, images are imported with generic operating-system names; e.g., \cite{mesnard20} uses `\inlinecode{FROM ubuntu:16.04}'.
The extracted tarball (from \url{https://partner-images.canonical.com/core/xenial}) is updated with different software versions almost monthly and only archives the most recent five images.
If the Dockerfile is run in different months, it will contain different core operating system components.
In the year 2024, when long-term support for this version of Ubuntu expires, the image will be unavailable at the expected URL.
This is similar for other OSes: pre-built binary files are large and expensive to maintain and archive.
Furthermore, Docker requires root permissions, and only supports recent (``long-term-support'') versions of the host kernel, so older Docker images may not be executable.

Once the host OS is ready, PMs are used to install the software environment.
Usually the OS's PM, like `\inlinecode{apt}' or `\inlinecode{yum}', is used first, and higher-level software is built with more generic PMs like Conda, Nix, GNU Guix or Spack.
The OS PM suffers from the same longevity problem as the OS.
Some third-party tools like Conda and Spack are written in high-level languages like Python, so the PM itself depends on the host's Python installation.
Nix and GNU Guix do not have any dependencies and produce bit-wise identical programs, but they need root permissions.
Generally, the exact version of each software's dependencies is not precisely identified in the build instructions (although that could be implemented).
Therefore, unless precise version identifiers of \emph{every software package} are stored, a PM will use the most recent version.
Furthermore, because each third-party PM introduces its own language and framework, this increases the project's complexity.

With the software environment built, job management is the next component of a workflow.
Visual workflow tools like Apache Taverna, GenePattern, Kepler or VisTrails (mostly introduced in the 2000s and using Java) encourage modularity and robust job management, but the more recent tools (mostly in Python) leave this to project authors.
Designing a modular project needs to be encouraged and facilitated because scientists (who are not usually trained in data management) will rarely apply best practices in project management and data carpentry.
This includes automatic verification: while it is possible in many solutions, it is rarely practiced, which leads to many inefficiencies in project cost and/or scientific accuracy (reusing, expanding or validating will be expensive).

Finally, to add narrative, computational notebooks \cite{rule18}, like Jupyter, are increasingly being used in many solutions.
However, the complex dependency trees of such web-based tools make them very vulnerable to the passage of time, e.g., see Figure 1 of \cite{alliez19} for the dependencies of Matplotlib, one of the simpler Jupyter dependencies.
The longevity of a project is determined by its shortest-lived dependency.
Furthermore, as with job management, computational notebooks do not actively encourage good practices in programming or project management.
Hence they can rarely deliver their promised potential \cite{rule18} and can even hamper reproducibility \cite{pimentel19}.

An exceptional solution we encountered was the Image Processing Online Journal (IPOL, \href{https://www.ipol.im}{ipol.im}).
Submitted papers must be accompanied by an ISO C implementation of their algorithm (which is buildable on any widely used operating system) with example images/data that can also be executed on their webpage.
This is possible due to the focus on low-level algorithms that do not need any dependencies beyond an ISO C compiler.
Many data-intensive projects, however, involve dozens of high-level dependencies, with large and complex data formats and analysis, so this solution is not scalable.





\section{Proposed criteria for longevity}

The main premise is that starting a project with a robust data management strategy (or tools that provide it) is much more effective, for researchers and the community, than imposing it in the end \cite{austin17,fineberg19}.
Researchers play a critical role\cite{austin17} in making their research more Findable, Accessible, Interoperable, and Reusable (the FAIR principles).
Simply archiving a project workflow in a repository after the project is finished is, on its own, insufficient, and maintenance by repository staff is often either practically infeasible or unscalable.
In this paper we argue that workflows satisfying the criteria below can improve researcher workflows during the project, reduce the cost of curation for repositories after publication, and maximize the FAIRness of the deliverables for future researchers.

\textbf{Criterion 1: Completeness.}
A project that is complete (self-contained) has the following properties.
(1) It has no dependency beyond the Portable Operating System Interface (POSIX).
POSIX, defined by the IEEE as a minimal Unix-like environment, is implemented by many OSes and is a reliable foundation for longevity in software execution.
(2) ``No dependency'' requires that the project itself must be primarily stored in plain text, not needing specialized software to open, parse or execute.
(3) It does not affect the host OS (its libraries, programs, or environment).
(4) It does not require root or administrator privileges.
(5) It builds its own controlled software for an independent environment.
(6) It can run locally (without an internet connection).
(7) It contains the full project's analysis, visualization \emph{and} narrative: from access to raw inputs to doing the analysis, producing final data products \emph{and} its final published report with figures, e.g., PDF or HTML.
(8) It can run automatically, with no human interaction.

\textbf{Criterion 2: Modularity.}
A modular project enables and encourages the analysis to be broken into independent modules with well-defined inputs/outputs and minimal side effects.
Explicit communication between various modules enables optimizations on many levels:
(1) Execution in parallel and avoiding redundancies (when a dependency of a module has not changed, it will not be re-run).
(2) Usage in other projects.
(3) Easy debugging and improvements.
(4) Modular citation of specific parts.
(5) Provenance extraction.
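The first optimization above (skipping unchanged modules) can be sketched with Make, the job manager adopted later in this paper; all file names here are hypothetical:

```shell
# Minimal sketch of dependency-driven execution (hypothetical files):
# a target is only rebuilt when one of its prerequisites has changed.
tmpdir=$(mktemp -d); cd "$tmpdir"
printf 'result.txt: input.conf\n\tcp input.conf result.txt\n' > Makefile
echo 'year=2020' > input.conf
make result.txt                # first run: the recipe is executed
second=$(make result.txt)      # second run: the target is up to date
echo "$second"
```

On the second invocation, GNU Make reports that \inlinecode{result.txt} is up to date and re-runs nothing; in a real project this is what avoids redundant re-execution of expensive analysis steps.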

\textbf{Criterion 3: Minimal complexity.}
Minimal complexity can be interpreted as:
(1) Avoiding the language or framework that is currently in vogue (for the workflow, not necessarily the high-level analysis).
A popular framework typically falls out of fashion and requires significant resources to translate or rewrite every few years.
More stable/basic tools can be used with less long-term maintenance.
(2) Avoiding too many different languages and frameworks, e.g., when the workflow's PM and analysis are orchestrated in the same framework, it becomes easier to adopt and encourages good practices.

\textbf{Criterion 4: Scalability.}
A scalable project can easily be used in arbitrarily large and/or complex projects.
On a small scale, the criteria here are trivial to implement, but as the projects get more complex, an implementation can become unsustainable.

\textbf{Criterion 5: Verifiable inputs and outputs.}
The project should verify its inputs (software source code and data) \emph{and} outputs.
Reproduction should be straightforward enough such that ``\emph{a clerk can do it}''\cite{claerbout1992}, without requiring expert knowledge.

\textbf{Criterion 6: History and temporal provenance.}
No exploratory research project is done in a single/first attempt.
Projects evolve as they are being completed.
It is natural that earlier phases of a project are redesigned/optimized only after later phases have been completed.
These types of research papers often report this with statements like ``\emph{we [first] tried method [or parameter] X, but Y is used here because it gave lower random error}''.
The ``history'' is thus as valuable as the final/published version.

\textbf{Criterion 7: Including narrative, linked to analysis.}
A project is not just its computational analysis.
A raw plot, figure or table is hardly meaningful alone, even when accompanied by the code that generated it.
A narrative description is also part of the deliverables (defined as ``data article'' in \cite{austin17}): describing the purpose of the computations, and interpretations of the result, and the context in relation to other projects/papers.
This is related to longevity, because if a workflow only contains the steps to do the analysis or generate the plots, it may become separated from its accompanying published paper in time due to the different hosts.

\textbf{Criterion 8: Free and open source software.}
Technically, reproducibility (as defined in \cite{fineberg19}) is possible with non-free or non-open-source software (a black box).
This criterion is necessary to complement that definition (nature is already a black box).
If a project is free software (as formally defined), then others can learn from, modify, and build on it.
When the software used by the project is itself also free:
(1) The lineage can be traced to the implemented algorithms, possibly enabling optimizations on that level.
(2) The source can be modified to work on future hardware.
In contrast, a non-free software package typically cannot be distributed by others, making it reliant on a single server (even without payments).










\section{Proof of concept: Maneage}

Given the limitations of existing tools with respect to the proposed criteria, it is necessary to show a proof of concept.
The proof presented here has already been tested in previously published papers \cite{akhlaghi19, infante20} and was recently awarded a Research Data Alliance (RDA) adoption grant for implementing the recommendations of the joint RDA and World Data System (WDS) working group on Publishing Data Workflows\cite{austin17}, from the researcher perspective to ensure longevity.

The proof of concept is called Maneage, for \emph{Man}aging data Lin\emph{eage} (ending is pronounced like ``Lineage'').
It was developed along with the criteria, as a parallel research project over 5 years, for publishing our reproducible workflows to supplement our research.
Its primordial form was implemented in \cite{akhlaghi15} and later evolved in \href{http://doi.org/10.5281/zenodo.1163746}{zenodo.1163746} and \href{http://doi.org/10.5281/zenodo.1164774}{zenodo.1164774}.

Technically, the hardest criterion to implement was completeness (and in particular no dependency beyond POSIX), blended with minimal complexity.
One proposed solution was the Guix Workflow Language (GWL) which is written in the same framework (GNU Guile, an implementation of Scheme) as GNU Guix (a PM).
But as natural scientists (astronomers), our background was with languages like Shell, Python, C or Fortran.
Not having any exposure to Lisp/Scheme and their fundamentally different style made it very hard for us to adopt GWL.
Furthermore, the desired solution was meant to be easily understandable/usable by fellow scientists, who generally have not had exposure to Lisp/Scheme.

Inspired by GWL+Guix, a single job management tool was chosen for both the installation of software \emph{and} the analysis workflow: Make.
Make is not an analysis language; it is a job manager that decides when to call analysis programs (in any language, like Python, R, Julia, Shell or C).
Make is standardized in POSIX and is used in almost all core OS components.
It is thus mature, actively maintained and highly optimized.
Make was recommended by the pioneers of reproducible research\cite{claerbout1992,schwab2000} and many researchers have already had a minimal exposure to it (when building research software).
%However, because they didn't attempt to build the software environment, in 2006 they moved to SCons (Make-simulator in Python which also attempts to manage software dependencies) in a project called Madagascar (\url{http://ahay.org}), which is highly tailored to Geophysics.

Linking the analysis and narrative was another major design choice.
Literate programming, implemented as Computational Notebooks like Jupyter, is a common solution these days.
However, due to the problems above, our implementation follows a more abstract design: providing a more direct and precise, but modular (not in the same file) connection.

Assuming that the narrative is typeset in \LaTeX{}, the connection between the analysis and narrative (usually as numbers) is through \LaTeX{} macros, that are automatically defined during the analysis.
For example, in the abstract of \cite{akhlaghi19} we say `\emph{... detect the outer wings of M51 down to S/N of 0.25 ...}'.
The \LaTeX{} source of the quote above is: `\inlinecode{\small detect the outer wings of M51 down to S/N of \$\textbackslash{}demo\-sf\-optimized\-sn\$}'.
The macro `\inlinecode{\small\textbackslash{}demosfoptimizedsn}' is set during the analysis, and expands to the value `\inlinecode{0.25}' when the PDF output is built.
Such values also depend on the analysis, hence just as plots, figures or tables they should also be reproduced.
As a side-effect, these macros act as a quantifiable link between the narrative and analysis, with the granularity of a word in a sentence and exact analysis command.
This allows accurate provenance \emph{and} automatic updates to the text when necessary.
Manually typing such numbers in the narrative is prone to errors and discourages experimentation after writing the first draft.
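As a minimal sketch of this mechanism (reusing the macro name from the example above; the output file name \inlinecode{macros.tex} is hypothetical), an analysis script can export its result in one line:

```shell
# An analysis step exports its computed value as a LaTeX macro.
tmpdir=$(mktemp -d); cd "$tmpdir"
sn=0.25    # value assumed to be computed by the analysis
printf '\\newcommand{\\demosfoptimizedsn}{%s}\n' "$sn" > macros.tex
cat macros.tex    # -> \newcommand{\demosfoptimizedsn}{0.25}
```

The narrative then simply inputs this file, so rebuilding the analysis automatically updates the number printed in the paper.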

The ultimate aim of any project is to produce a report accompanying a dataset with some visualizations, or a research article in a journal.
Let's call it \inlinecode{paper.pdf}.
Hence the files hosting the macros (that go into the report) of each analysis step, build the core structure (skeleton) of Maneage.
During the software building (``configuration'') phase, each software is identified by a \LaTeX{} file, containing its official name, version and possible citation.
In the end, they are combined for precise software acknowledgment and citation (see the appendices of \cite{akhlaghi19, infante20}, not included here due to the strict word-limit).
Simultaneously, these files act as Make \emph{targets} and \emph{prerequisite}s to allow accurate dependency tracking and optimized execution (parallel, no redundancies), for any complexity (e.g., Maneage also builds Matplotlib if requested, see Figure 1 of \cite{alliez19}).
Software dependencies are built down to precise versions of the shell, POSIX tools (e.g., GNU Coreutils), \TeX{}Live, C compiler, and the C library (task 15390) for an exactly reproducible environment.
For fast relocation of the project (without building from source) it is possible to build it in the popular container, or VM, technology of the day.

In building software, only the high-level choice of which software to build differs between projects; the build recipes of each software do not generally change.
However, the analysis will naturally be different from one project to another.
Therefore, a design was necessary to satisfy the modularity, scalability and minimal complexity criteria while being generic enough to host any project.
To avoid getting too abstract, we will demonstrate it by replicating Figure 1C of \cite{menke20} in Figure \ref{fig:datalineage} (top).
Figure \ref{fig:datalineage} (bottom) is the data lineage graph that produced it (including this complete paper).

\begin{figure*}[t]
  \begin{center}
    \includetikz{figure-tools-per-year}
    \includetikz{figure-data-lineage}
  \end{center}
  \vspace{-3mm}
  \caption{\label{fig:datalineage}
    Top: an enhanced replica of figure 1C in \cite{menke20}, shown here to demonstrate Maneage.
    It shows the ratio of papers mentioning software tools (green line, left vertical axis) to total number of papers studied in that year (light red bars, right vertical axis in log-scale).
    Bottom: Schematic representation of the data lineage, or workflow, to generate the plot above.
    Each colored box is a file in the project and the arrows show the dependencies between them.
    Green files/boxes are plain-text files that are under version control and in the project source directory.
    Blue files/boxes are output files in the build-directory, shown within the Makefile (\inlinecode{*.mk}) where they are defined as a \emph{target}.
    For example, \inlinecode{paper.pdf} depends on \inlinecode{project.tex} (in the build directory; generated automatically) and \inlinecode{paper.tex} (in the source directory; written manually).
    The solid arrows and full-opacity built boxes are included with this paper's source.
    The dashed arrows and low-opacity built boxes show the scalability by adding hypothetical steps to the project.
  }
\end{figure*}

Analysis is orchestrated in a single point of entry (\inlinecode{top-make.mk}, which is a Makefile).
It is only responsible for \inlinecode{include}-ing the modular \emph{subMakefiles} of the analysis, in the desired order, not doing any analysis itself.
This is shown in Figure \ref{fig:datalineage} (bottom) where all the built/blue files are placed over subMakefiles.
Even a non-specialist reader will be able to understand the high-level logic of the project (irrespective of the low-level implementation details) by simple visual inspection of this file, provided that the subMakefile names are descriptive.
A human-friendly design (that is also optimized for execution) is a critical component of publishing reproducible workflows.

In all projects, \inlinecode{top-make.mk} first loads \inlinecode{initialize.mk} and \inlinecode{download.mk}, and finishes with \inlinecode{verify.mk} and \inlinecode{paper.mk}.
Project authors add their modular subMakefiles in between (after \inlinecode{download.mk} and before \inlinecode{verify.mk}).
In Figure \ref{fig:datalineage} (bottom), the project-specific subMakefiles are \inlinecode{format.mk} \& \inlinecode{demo-plot.mk}.
Except for \inlinecode{paper.mk} (which builds the ultimate target \inlinecode{paper.pdf}) all subMakefiles build a \LaTeX{} macro file with the same base-name (a \inlinecode{.tex} in each subMakefile of Figure \ref{fig:datalineage}).
Other built files ultimately cascade down in the lineage (through other files) to one of these macro files.
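A hypothetical, minimal subMakefile following this pattern (the names \inlinecode{demo.tex}, \inlinecode{columns.txt} and \inlinecode{democount} are illustrative, not Maneage's actual files) can be generated and run as follows:

```shell
# Sketch: a subMakefile whose final target is a LaTeX macro file.
tmpdir=$(mktemp -d); cd "$tmpdir"
printf 'one\ntwo\nthree\n' > columns.txt    # stand-in analysis output
printf 'demo.tex: columns.txt\n\tn=$$(wc -l < columns.txt); printf '\''\\\\newcommand{\\\\democount}{%%s}'\'' $$n > demo.tex\n' > Makefile
make demo.tex
cat demo.tex    # -> \newcommand{\democount}{3}
```

The recipe counts the rows of its prerequisite and writes the count as a macro, so the `analysis' result reaches the paper only through the macro file.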

Irrespective of the number of subMakefiles, just before reaching the ultimate target (\inlinecode{paper.pdf}), the lineage reaches a bottleneck in \inlinecode{verify.mk} to satisfy the verification criterion.
All the macro files, plot information and published datasets of the project are verified with their checksums here to automatically ensure exact reproducibility.
Where exact reproducibility is not possible, values can be verified by any statistical means (specified by the project authors).
We note that this step was not yet implemented in \cite{akhlaghi19, infante20}.
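The verification step can be sketched with a standard checksum tool (file names here are hypothetical; \inlinecode{verify.mk} automates this over all the project's macro files and published datasets):

```shell
# Sketch of checksum-based verification of a result file.
tmpdir=$(mktemp -d); cd "$tmpdir"
printf 'year,count\n2020,42\n' > result.csv
sha256sum result.csv > result.csv.sha256    # recorded at publication
sha256sum -c result.csv.sha256              # later: confirms exact reproduction
```

If a re-run of the analysis produces even one different byte in \inlinecode{result.csv}, the check fails, flagging the reproduction attempt for inspection.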

\begin{figure*}[t]
  \begin{center} \includetikz{figure-branching}\end{center}
  \vspace{-3mm}
  \caption{\label{fig:branching} Maneage is a Git branch, projects using Maneage are branched-off of it and apply their customizations.
    (a) shows a hypothetical project's history prior to publication.
    The low-level structure (in Maneage, shared between all projects) can be updated by merging with Maneage.
    (b) shows how a finished/published project can be revitalized for new technologies simply by merging with the core branch.
    Each Git ``commit'' is shown on its branch as a colored ellipse, with its hash printed inside it.
    The commits are colored based on their branch.
    The collaboration and two paper icons are respectively made by `mynamepong' and `iconixar' from \url{www.flaticon.com}.
  }
\end{figure*}

To further minimize complexity, the low-level implementation is separated from the high-level execution through configuration files.
By convention in Maneage, the subMakefiles, and the programs they call for number-crunching, do not contain any fixed numbers, settings or parameters.
Parameters are set as Make variables in ``configuration files'' (with a \inlinecode{.conf} suffix) and passed to the respective program.
For example, in Figure \ref{fig:datalineage}, \inlinecode{INPUTS.conf} contains URLs and checksums for all imported datasets, enabling exact verification before usage.
As another demo, we report that \cite{menke20} studied $\menkenumpapersdemocount$ papers in $\menkenumpapersdemoyear$ (which is not in their original plot).
The number \inlinecode{\menkenumpapersdemoyear} is stored in \inlinecode{demo-year.conf}.
As the lineage shows, the result (\inlinecode{\menkenumpapersdemocount}) was calculated after generating \inlinecode{columns.txt}.
Both are expanded in this PDF as \LaTeX{} macros.
This enables the reader to change the value in \inlinecode{demo-year.conf} to automatically update the result, without necessarily knowing how it was generated.
Furthermore, the configuration files are a prerequisite of the targets that use them.
Hence if changed, Make will \emph{only} re-execute the dependent recipe and all its descendants with no modification to the project's source or other built products.
This fast/cheap testing encourages experimentation (without necessarily knowing the implementation details, e.g., by co-authors or future readers), and ensures self-consistency.
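This behavior can be sketched as follows (a hypothetical \inlinecode{year.conf} and target; the \inlinecode{sleep} merely guarantees a newer timestamp on coarse-grained filesystems):

```shell
# Sketch: a configuration file as a Make prerequisite. Editing the
# .conf re-runs only the recipe(s) that depend on it.
tmpdir=$(mktemp -d); cd "$tmpdir"
echo 'YEAR = 2019' > year.conf
printf 'out.txt: year.conf\n\tgrep YEAR year.conf > out.txt\n' > Makefile
make out.txt                  # built from the 2019 configuration
sleep 1                       # ensure a newer mtime on year.conf
echo 'YEAR = 2020' > year.conf
make out.txt                  # conf changed, so the recipe re-runs
```

Targets that do not depend on \inlinecode{year.conf} would be left untouched, which is what makes such parameter experiments cheap.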

Finally, to satisfy the temporal provenance criteria, version control (currently implemented in Git), plays a defining role in Maneage as shown in Figure \ref{fig:branching}.
In practice, Maneage is a Git branch that contains the shared components, or infrastructure of all projects (e.g., software tarball URLs, build recipes, common subMakefiles and interface script).
Every project starts by branching off the Maneage branch and customizing it (e.g., adding its own title, input data links, narrative, and subMakefiles for its analysis); see Listing \ref{code:branching}.

\begin{lstlisting}[
    label=code:branching,
    caption={Starting a new project with Maneage, and building it},
  ]
# Cloning main Maneage branch and branching-off of it.
$ git clone https://git.maneage.org/project.git
$ cd project
$ git remote rename origin origin-maneage
$ git checkout -b master

# Build the project in two phases:
$ ./project configure    # Build software environment.
$ ./project make         # Do analysis, build PDF paper.
\end{lstlisting}

As Figure \ref{fig:branching} shows, due to this architecture, it is always possible to import or merge Maneage into the project to improve the low-level infrastructure:
in (a) the authors merge into Maneage during an ongoing project,
in (b) readers can do it after the paper's publication, e.g., when the project's infrastructure is outdated or does not build, and the authors cannot be reached.
Low-level improvements in Maneage are thus automatically propagated to all projects.
This greatly reduces the cost of curation, or maintenance, of each individual project, before \emph{and} after publication.






\section{Discussion}

%% It should provide some insight or lessons learned.
%% What is the message we should take from the experience?
%% Are there clear demonstrated design principles that can be reapplied elsewhere?
%% Are there roadblocks or bottlenecks that others might avoid?
%% Are there suggested community or work practices that can make things smoother?
%% Attempt to generalise the significance.
%% should not just present a solution or an enquiry into a unitary problem but make an effort to demonstrate wider significance and application and say something more about the ‘science of data’ more generally.

Having shown that it is possible to build workflows satisfying the proposed criteria, here we review the lessons learned and insights gained, while sharing the experience of implementing the RDA/WDS recommendations.
We also discuss the design principles, and how they may be generalized and used in other projects.
In particular, with the support of RDA, the user base and development of the criteria and Maneage grew phenomenally, highlighting some difficulties for the widespread adoption of these criteria.

Firstly, although most researchers have some familiarity with the necessary low-level tools (e.g., Git, \LaTeX, the command line and Make), these tools are not widely used.
But we have noticed that after witnessing the improvements in their research, many (especially early career researchers) have started mastering these tools.
Scientists are rarely trained sufficiently in data management or software development, and the plethora of high-level tools that change every few years discourages them.
Fast-evolving tools are primarily targeted at software developers, who are paid to learn and use them effectively for short-term projects before moving on to the next technology.
Scientists, on the other hand, need to focus on their own research fields, and need to consider longevity.
Hence, arguably the most important feature of these criteria is that they provide a fully working template, using mature and time-tested tools, for blending version control, paper's narrative, software management \emph{and} a modular lineage for analysis.
We have seen that a complete \emph{and} customizable template with a clear checklist of first steps is much more effective in encouraging mastery of these essential tools for modern science.
As opposed to having abstract/isolated tutorials on each tool individually.

Secondly, to satisfy the completeness criteria, all the necessary software of the project must be built on various POSIX-compatible systems (we actively test Maneage on several GNU/Linux distributions and macOS).
This requires maintenance by our core team and consumes time and energy.
However, the PM and analysis share the same job manager, and our experience so far has shown that users' experience with the analysis empowers some of them to add or fix their required software on their own systems.
Later, they share these improvements as commits on the core branch, thus propagating them to all derived projects.
This has already occurred multiple times.

Thirdly, publishing a project's reproducible data lineage immediately after the paper's publication enables others to continue with follow-up papers, in competition with the original authors.
We propose two solutions:
1) Through the Git history, the work added by another team at any phase of the project can be quantified, contributing to a new concept of authorship in scientific projects and helping to quantify Newton's famous ``\emph{standing on the shoulders of giants}'' quote.
However, this is a long-term goal and requires major changes to academic value systems.
2) Authors can be given a grace period where the journal or a third party embargoes the source, keeping it private for the embargo period and then publishing it.
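The per-author quantification in the first solution already follows from standard Git tooling; the following is only an illustrative sketch (the throwaway repository and author names are hypothetical, not part of Maneage itself):

```shell
# Hypothetical sketch: quantify each contributor's work from a
# project's Git history.  A throwaway repository with two authors
# stands in for a real project clone.
set -e
repo=$(mktemp -d)
cd "$repo"
git init -q .
git -c user.name="Alice" -c user.email="a@example.org" \
    commit -q --allow-empty -m "analysis step"
git -c user.name="Bob" -c user.email="b@example.org" \
    commit -q --allow-empty -m "follow-up step"

# Commits per author: a first-order proxy for contributed work.
# Finer measures (lines changed per phase) can be derived from
# 'git log --numstat' over the relevant revision range.
git shortlog --summary --numbered HEAD
```

In a real project, the same commands run over the published history would attribute each phase of the analysis to its authors.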

Other implementations of the criteria, or future improvements in Maneage, may solve some of the caveats above.
However, the proof of concept already shows many advantages of adopting the criteria.
For example, publication of projects with these criteria on a wide scale allows automatic workflow generation, optimized for desired characteristics of the results (for example via machine learning).
Because of the completeness criteria, algorithms and data selection can be similarly optimized.
Furthermore, through elements like the macros, natural language processing can also be included, automatically analyzing the connection between an analysis and its resulting narrative, \emph{and} the history of that analysis/narrative.
Parsers can be written over projects for meta-research and data provenance studies, for example to generate ``research objects''.
As another example, when a bug is found in one software package, all affected projects can be found and the scale of the effect can be measured.
Combined with Software Heritage, precise high-level science components of Maneage projects can be accurately cited (e.g., failed or abandoned tests at any point in their history).
Many components of ``machine-actionable'' data management plans can be automatically filled out by Maneage, which is useful for project PIs and grant funders.

From the data repository perspective these criteria can also be very useful, for example with regard to the challenges mentioned in \cite{austin17}:
(1) The burden of curation is shared among all project authors and/or readers (who may find a bug and fix it), not just database curators, improving sustainability.
(2) Automated and persistent bi-directional linking of data and publication can be established through the published \& \emph{complete} data lineage that is under version control.
(3) Software management.
With these criteria, each project's unique and complete software management is included: it is not a third-party PM that needs to be maintained by data center employees.
This enables easy management, preservation, publication and citation of the software used.
For example, see \href{https://doi.org/10.5281/zenodo.3524937}{zenodo.3524937}, \href{https://doi.org/10.5281/zenodo.3408481}{zenodo.3408481} and \href{https://doi.org/10.5281/zenodo.1163746}{zenodo.1163746}, where we have exploited the free-software criterion to distribute the tarballs of all the software used with the project's source and deliverables.
(4) ``Linkages between documentation, code, data, and journal articles in an integrated environment'', which is the whole purpose of these criteria.





% use section* for acknowledgment
\section*{Acknowledgment}

The authors wish to thank (sorted alphabetically)
Julia Aguilar-Cabello,
Alice Allen,
Pedram Ashofteh Ardakani,
Roland Bacon,
Surena Fatemi,
Fabrizio Gagliardi,
Konrad Hinsen,
Mohammad-reza Khellat,
Johan Knapen,
Tamara Kovazh,
Ryan O'Connor,
Simon Portegies Zwart,
Idafen Santana-P\'erez,
Elham Saremi,
Yahya Sefidbakht,
Zahra Sharbaf,
Nadia Tonello,
and Ignacio Trujillo
for their useful help, suggestions and feedback on Maneage and this paper.

Work on Maneage, and this paper, has been partially funded/supported by the following institutions:
The Japanese Ministry of Education, Culture, Sports, Science, and Technology (MEXT) PhD scholarship to M. Akhlaghi and its Grant-in-Aid for Scientific Research (21244012, 24253003).
The European Research Council (ERC) advanced grant 339659-MUSICOS.
The European Union (EU) Horizon 2020 (H2020) research and innovation programmes No 777388 under RDA EU 4.0 project, and Marie Sk\l{}odowska-Curie grant agreement No 721463 to the SUNDIAL ITN.
The State Research Agency (AEI) of the Spanish Ministry of Science, Innovation and Universities (MCIU) and the European Regional Development Fund (ERDF) under the grant AYA2016-76219-P.
The IAC project P/300724, financed by the MCIU, through the Canary Islands Department of Economy, Knowledge and Employment.
The Fundaci\'on BBVA under its 2017 programme of assistance to scientific research groups, for the project ``Using machine-learning techniques to drag galaxies from the noise in deep imaging''.
The ``A next-generation worldwide quantum sensor network with optical atomic clocks'' project of the TEAM IV programme of the Foundation for Polish Science co-financed by the EU under ERDF.
The Polish MNiSW grant DIR/WK/2018/12.
The Pozna\'n Supercomputing and Networking Center (PSNC) computational grant 314.









%% Bibliography
\bibliographystyle{IEEEtran}
\bibliography{IEEEabrv,references}

%% Biography
\begin{IEEEbiographynophoto}{Mohammad Akhlaghi}
  is a postdoctoral researcher at the Instituto de Astrof\'isica de Canarias, Tenerife, Spain.
  His main scientific interest is in early galaxy evolution, but to extract information from the modern complex datasets, he has been involved in image processing and reproducible workflow management where he has founded GNU Astronomy Utilities (Gnuastro) and Maneage (introduced here).
  He received his PhD in astronomy from Tohoku University, Sendai Japan, and before coming to Tenerife, held a CNRS postdoc position at the Centre de Recherche Astrophysique de Lyon (CRAL).
  Contact him at mohammad@akhlaghi.org and find his website at \url{https://akhlaghi.org}.
\end{IEEEbiographynophoto}

\begin{IEEEbiographynophoto}{Ra\'ul Infante-Sainz}
  is a doctoral student at the Instituto de Astrof\'isica de Canarias, Tenerife, Spain.
  Contact him at infantesainz@gmail.com.
\end{IEEEbiographynophoto}

\begin{IEEEbiographynophoto}{Boudewijn F. Roukema}
  is a professor at the Institute of Astronomy in the Faculty of Physics, Astronomy and Informatics at Nicolaus Copernicus University in Toru\'n, Grudziadzka 5, Poland.
  His research includes galaxy formation, large-scale structure of the Universe, cosmic topology and inhomogeneous cosmology.
  He is involved in experimental research aimed at improving standards in research reproducibility.
  Roukema obtained his PhD in astronomy and astrophysics at the Australian National University.
  Contact him at boud@astro.uni.torun.pl.
\end{IEEEbiographynophoto}

\begin{IEEEbiographynophoto}{David Valls-Gabaud}
  is a CNRS Research Director at the Observatoire de Paris, France.
  His research interests span from cosmology and galaxy evolution to stellar physics and instrumentation.
  Contact him at david.valls-gabaud@obspm.fr.
\end{IEEEbiographynophoto}

\begin{IEEEbiographynophoto}{Roberto Baena-Gall\'e}
  is a postdoctoral researcher at the Instituto de Astrof\'isica de Canarias, Tenerife, Spain.
  Before joining the IAC, he worked at the University of Barcelona, the Reial Acad\`emia de Ci\`encies i Arts de Barcelona, the Universit\'e Pierre et Marie Curie, and ONERA (the French Aerospace Lab).
  His research interests are image processing and the resolution of inverse problems, with applications to AO-corrected fields of view, satellite identification under atmospheric turbulence, and retinal images.
  He is currently involved in projects related to PSF estimation in large astronomical surveys and machine learning.
  Baena-Gall\'e holds MS degrees in Telecommunication and Electronic Engineering from the University of Seville (Spain), and received a PhD in astronomy from the University of Barcelona (Spain).
  Contact him at rbaena@iac.es.
\end{IEEEbiographynophoto}

\end{document}

%% This file is free software: you can redistribute it and/or modify it
%% under the terms of the GNU General Public License as published by the
%% Free Software Foundation, either version 3 of the License, or (at your
%% option) any later version.
%
%% This file is distributed in the hope that it will be useful, but WITHOUT
%% ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
%% FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
%% for more details. See <http://www.gnu.org/licenses/>.