Revision 6a6fdb21cbe59e84d205d62b7cdc50c9089890a4 authored by Jaeho Shin on 28 March 2016, 18:42:56 UTC, committed by Jaeho Shin on 28 March 2016, 18:42:56 UTC
1 parent b679de0
Raw File
deepdive-load
#!/usr/bin/env bash
# deepdive-load -- Loads a given relation's data
# > deepdive load RELATION[(COLUMN[,COLUMN]...)] [SOURCE...]
# Initializes given RELATION in the database and loads data from SOURCE for
# optionally specified COLUMNs.  When SOURCE is unspecified, the data is loaded
# from files found on path input/RELATION.* under the DeepDive application.
#
# When RELATION is a random variable and no COLUMN is specified, each SOURCE is
# expected to contain the `label` column at the end, in addition to all
# user-defined ones.
#
# This command must be run under an environment whose DEEPDIVE_DB_URL variable
# is set to a proper URL, or under a DeepDive application where the URL is set
# in the db.url file.
#
# The format of each SOURCE is determined by the filename extension, such as
# `.tsv` or `.csv`.  Setting the DEEPDIVE_LOAD_FORMAT environment variable to
# tsv, csv, or sql forces that format, ignoring the filename.  Setting the
# DEEPDIVE_LOAD_FORMAT_DEFAULT environment variable makes the command fall back
# to the specified format when it cannot be determined from the filename.
#
# For example:
#
# > deepdive load sentences
# creates table 'sentences' and loads data from input/sentences.tsv if there
# is one.
#
# > deepdive load sentences path/to/sentences.tsv
# creates table 'sentences' and loads data from path/to/sentences.tsv.
#
# > deepdive load sentences sentences.csv.bz2
# creates table 'sentences' and loads from a compressed CSV file.
#
# > deepdive load sentences sentences.csv.sh
# creates table 'sentences' and loads the CSV output of a shell script.
##
set -eu

# Both format knobs are optional: default them to empty strings so that later
# references stay safe under `set -u`.
: "${DEEPDIVE_LOAD_FORMAT_DEFAULT:=}"
: "${DEEPDIVE_LOAD_FORMAT:=}"

# the first argument names the relation to load; whatever follows are SOURCEs
[[ $# -gt 0 ]] || usage "$0" "Missing RELATION to load"
Relation=$1
shift
Columns=

# split an optional "(COLUMN[,COLUMN]...)" suffix off the relation name
if [[ $Relation == *"("*")" ]]; then
    Columns=${Relation#*"("}    # everything after the first "("
    Columns=${Columns%")"}      # ... without the closing ")"
    Relation=${Relation%%"("*}  # the name is what precedes the first "("
fi

# Decide where the data comes from.  In both cases DEEPDIVE_APP is exported
# before the database driver is loaded.
if [[ $# -gt 0 ]]; then
    # SOURCEs were given explicitly, so running inside an app is optional: a
    # failed lookup simply leaves DEEPDIVE_APP empty
    DEEPDIVE_APP=$(find-deepdive-app 2>/dev/null) || true
    export DEEPDIVE_APP
    . load-db-driver.sh
else
    # no SOURCE given: an app is required and its input/ directory is searched
    DEEPDIVE_APP=$(find-deepdive-app)
    export DEEPDIVE_APP
    . load-db-driver.sh
    cd "$DEEPDIVE_APP"

    # an executable input/init_RELATION.sh takes over the whole load
    initsh=input/init_"$Relation".sh
    if [[ -x "$initsh" ]]; then
        echo "Loading $Relation via $initsh"
        exec "$initsh" "$@"
    fi

    # otherwise pick the first existing input/RELATION.FORMAT[.bz2|.gz|.sh]
    for Path in input/"$Relation".{tsv,csv,json-seq,sql}{,.bz2,.gz,.sh}; do
        if [[ -e "$Path" ]]; then
            break
        fi
    done
    [[ -e "$Path" ]] ||
        usage "$0" "input/$Relation.*: No data source found for $Relation"

    # continue as if the found path had been given as the only SOURCE
    set -- "$Path"
fi

# Work out which columns to load.  An explicit COLUMN list always wins.
# Otherwise, when the app has been compiled and $Relation is declared there as
# a random variable (it has a variable_type), use its user-defined columns
# ordered by their index, followed by the internal `label` column.  For any
# other relation jq prints nothing and $Columns stays empty.
if [[ -z "$Columns" ]] && [[ -e "$DEEPDIVE_APP" ]] && app-has-been-compiled; then
    Columns=$(
        cd "$DEEPDIVE_APP"
        # hand the relation name to jq through its environment (env.Relation)
        export Relation
        jq -r '
            .deepdive_.schema.relations[env.Relation] |
            if .variable_type then
                .columns | to_entries |
                sort_by(.value.index) |
                map(.key)+["label"] | join(",")
            else
                empty
            end
        ' run/compiled/config.json
    )
fi

# how to load a set of source expressions
# (eval is used here to support process substitution for on-the-fly decompression)
#
# State shared by loadQueued, queue and the main loop:
#   currentFormat    format of the sources queued so far
#   pathsInFormat    their paths as given, used only for the progress message
#   sourcesInFormat  the same sources as one string of shell words (escaped
#                    paths or <(...) process substitutions) that gets eval'ed
currentFormat= pathsInFormat=() sourcesInFormat=

# loadQueued -- Loads every source queued so far into $Relation
# Falls back to DEEPDIVE_LOAD_FORMAT_DEFAULT when no format has been set yet
# and reports an error when that is empty as well.  SQL sources are
# concatenated and piped to db-prompt; sources in any other format are handed
# to db-load together with $Relation, $Columns and the format.  With nothing
# queued, the progress message names /dev/stdin and the sql branch has cat read
# standard input.
loadQueued() {
    [[ -n ${currentFormat:=$DEEPDIVE_LOAD_FORMAT_DEFAULT} ]] ||
        error "Specify DEEPDIVE_LOAD_FORMAT= or DEEPDIVE_LOAD_FORMAT_DEFAULT="
    echo "Loading $Relation${Columns:+($Columns)} from ${pathsInFormat[*]:-/dev/stdin} ($currentFormat format)"
    case $currentFormat in
        sql)
            local readStatus
            eval "cat $sourcesInFormat" | db-prompt
            # A pipeline only reports the status of its last command, so a
            # source that is missing or fails to decompress would otherwise go
            # unnoticed.  141 (SIGPIPE) is tolerated: it only means db-prompt
            # stopped reading before the end of the input.
            readStatus=${PIPESTATUS[0]}
            [[ $readStatus -eq 0 || $readStatus -eq 141 ]] ||
                error "${pathsInFormat[*]:-/dev/stdin}: Failed to read SQL source"
            ;;
        *)
            eval 'db-load "$Relation" "$Columns" "$currentFormat" '"$sourcesInFormat"
            ;;
    esac
}
# queue -- Remembers one more source for the next loadQueued
#   $1  the path as given, kept for the progress message
#   $2  the shell word that yields the data once eval'ed
queue() {
    local shownPath=$1 sourceExpr=$2
    pathsInFormat+=("$shownPath")
    sourcesInFormat+="${sourceExpr} "
}

# assume a particular format when explicitly specified
currentFormat=$DEEPDIVE_LOAD_FORMAT

# formatByName PATH -- Prints the data format implied by PATH's file name, or
# nothing when it is not recognized.  Only the last path component is looked
# at, so a directory such as dump.tsv.d/ cannot be mistaken for the format of
# the file inside it.
formatByName() {
    case ${1##*/} in
        *.tsv|*.tsv.*)           echo tsv      ;;
        *.csv|*.csv.*)           echo csv      ;;
        *.json-seq|*.json-seq.*) echo json-seq ;;
        *.sql|*.sql.*)           echo sql      ;;
    esac
}

# decompressorFor PATH -- Prints the command that writes PATH's plain data to
# its standard output, or nothing when PATH can be read as is.  The parallel
# tools (pbzip2, pigz) are preferred when they are installed.
decompressorFor() {
    case $1 in
        *.bz2)
            if type pbzip2 &>/dev/null
            then echo "pbzip2 -d -k -c"
            else echo "bzip2 -d -k -c"
            fi ;;
        *.gz)
            # no -k for gzip: -c already leaves the input file in place, and
            # gzip releases before 1.6 reject the option
            if type pigz &>/dev/null
            then echo "pigz -d -k -c"
            else echo "gzip -d -c"
            fi ;;
        *.sh)
            # a script is "decompressed" by running it and taking its output
            echo bash ;;
    esac
}

# group given paths by format, and batch call to load to support parallel load
format=$currentFormat
for path; do
    # determine the format, unless one is being forced for every source
    if [[ -z "$DEEPDIVE_LOAD_FORMAT" ]]; then
        format=$(formatByName "$path")
        if [[ -z "$format" ]]; then
            [[ -n "$DEEPDIVE_LOAD_FORMAT_DEFAULT" ]] ||
                error "$path: Unrecognized format, specify DEEPDIVE_LOAD_FORMAT= or DEEPDIVE_LOAD_FORMAT_DEFAULT="
            format=$DEEPDIVE_LOAD_FORMAT_DEFAULT
        fi
    fi
    # and how to decompress if needed
    decompress=$(decompressorFor "$path")

    # load once the format changes (nothing is queued yet while currentFormat
    # is still empty), then start a new batch
    if [[ "$format" != "$currentFormat" ]]; then
        [[ -z "$currentFormat" ]] || loadQueued
        currentFormat=$format pathsInFormat=() sourcesInFormat=
    fi

    # queue a source expression
    if [[ -n "$decompress" ]]; then
        # wrapped with decompression process if needed
        queue "$path" "<($decompress $(escape4sh "$path"))"
    else
        queue "$path" "$(escape4sh "$path")"
    fi
done

# load everything queued so far: the loop above only loads a batch when the
# format changes, so the last batch is still pending at this point
loadQueued
back to top