#!/usr/bin/env bash
# deepdive-load -- Loads a given relation's data
# > deepdive load RELATION[(COLUMN[,COLUMN]...)] [SOURCE...]
# Initializes given RELATION in the database and loads data from SOURCE for
# optionally specified COLUMNs. When SOURCE is unspecified, the data is loaded
# from files found on path input/RELATION.* under the DeepDive application.
#
# When RELATION is a random variable and no COLUMN is specified, the SOURCE are
# expected to contain the `label` column at the end, in addition to all
# user-defined ones.
#
# This command must be run under an environment whose DEEPDIVE_DB_URL variable
# is set to a proper URL, or under a DeepDive application where the URL is set
# in the db.url file.
#
# The format of each SOURCE is determined by the filename extension, such as
# `.tsv` or `.csv`. Defining the DEEPDIVE_LOAD_FORMAT environment to tsv, csv,
# or sql assumes a particular format, ignoring the filename. Defining
# DEEPDIVE_LOAD_FORMAT_DEFAULT environment will fall back to the specified
# format when it cannot be determined from the filename.
#
# For example:
#
# > deepdive load sentences
# creates table 'sentences' and loads data from input/sentences.tsv if there
# is one.
#
# > deepdive load sentences path/to/sentences.tsv
# creates table 'sentences' and loads data from path/to/sentences.tsv.
#
# > deepdive load sentences sentences.csv.bz2
# creates table 'sentences' and loads from a compressed CSV file.
#
# > deepdive load sentences sentences.csv.sh
# creates table 'sentences' and loads the CSV output of a shell script.
##
set -eu

# Both knobs default to empty.  DEEPDIVE_LOAD_FORMAT forces one format for
# every SOURCE (skipping detection); DEEPDIVE_LOAD_FORMAT_DEFAULT is only used
# when the format cannot be detected from the filename.
# (Quoted so the values are never word-split or glob-expanded.)
: "${DEEPDIVE_LOAD_FORMAT_DEFAULT:=}" "${DEEPDIVE_LOAD_FORMAT:=}"

# NOTE(review): `usage` and `error` are helpers found outside this script;
# presumably they print a message and terminate -- confirm.
[[ $# -gt 0 ]] || usage "$0" "Missing RELATION to load"
Relation=$1; shift
Columns=

# parse optional columns following relation name, i.e., RELATION(COL[,COL]...)
case $Relation in
    *"("*")")
        # keep column names separate: text between the first "(" and final ")"
        Columns="${Relation#*"("}"
        Columns=${Columns%")"}
        Relation=${Relation%%"("*}
        ;;
esac

if [[ $# -eq 0 ]]; then
    # when no SOURCE is specified, look under input/ of the application,
    # which is therefore required in this branch
    DEEPDIVE_APP=$(find-deepdive-app)
    export DEEPDIVE_APP
    . load-db-driver.sh
    cd "$DEEPDIVE_APP"

    # an executable input/init_RELATION.sh takes over the whole loading
    initsh=input/init_"$Relation".sh
    if [[ -x "$initsh" ]]; then
        echo "Loading $Relation via $initsh"
        exec "$initsh" "$@"
    fi

    # search under input/RELATION.* and stop at the first existing candidate;
    # when nothing exists, Path is left holding the last (missing) candidate
    for Path in input/"$Relation".{tsv,csv,json-seq,sql}{,.bz2,.gz,.sh}; do
        [[ -e "$Path" ]] || continue
        break
    done
    [[ -e "$Path" ]] || usage "$0" "input/$Relation.*: No data source found for $Relation"

    # use the found path as the only SOURCE
    set -- "$Path"
else
    # rely on the app if found (optional, hence the ignored failure)
    DEEPDIVE_APP=$(find-deepdive-app 2>/dev/null) || true
    export DEEPDIVE_APP
    . load-db-driver.sh
fi

# find the columns to load (when no $Columns were explicitly specified and
# $Relation is a random variable, load the user-defined columns with the
# internal label column but nothing else)
if [[ -z "$Columns" && -e "$DEEPDIVE_APP" ]] && app-has-been-compiled; then
    Columns=$(
        cd "$DEEPDIVE_APP"
        # Relation is handed to jq through its environment (env.Relation)
        Relation=$Relation jq -r '
            .deepdive_.schema.relations[env.Relation]
            | if .variable_type then
                .columns | to_entries | sort_by(.value.index) | map(.key)+["label"] | join(",")
              else
                empty
              end
        ' run/compiled/config.json
    )
fi

# State of the current batch of sources that share one format:
#   currentFormat    format of the queued batch
#   pathsInFormat    the paths as given (only used for the progress message)
#   sourcesInFormat  shell source expressions separated by spaces, for eval
currentFormat=
pathsInFormat=()
sourcesInFormat=

# Loads everything queued so far in one batch.
# Globals: Relation, Columns, pathsInFormat, sourcesInFormat (read);
#          currentFormat (read; set to DEEPDIVE_LOAD_FORMAT_DEFAULT if empty)
# (eval is used here to support process substitution for on-the-fly
# decompression; the expressions are built by the loop below, which passes
# every path through escape4sh.)
loadQueued() {
    [[ -n ${currentFormat:=$DEEPDIVE_LOAD_FORMAT_DEFAULT} ]] ||
        error "Specify DEEPDIVE_LOAD_FORMAT= or DEEPDIVE_LOAD_FORMAT_DEFAULT="
    echo "Loading $Relation${Columns:+($Columns)} from ${pathsInFormat[*]:-/dev/stdin} ($currentFormat format)"
    case $currentFormat in
        sql)
            # SQL sources are concatenated and piped to db-prompt
            eval "cat $sourcesInFormat" | db-prompt
            ;;
        *)
            # any other format goes to db-load with all sources as arguments
            eval 'db-load "$Relation" "$Columns" "$currentFormat" '"$sourcesInFormat"
            ;;
    esac
}

# Adds a source to the current batch.
# Arguments: $1 - path as given by the user
#            $2 - shell expression that evaluates to the data to load
queue() {
    pathsInFormat+=("$1")
    sourcesInFormat+="$2 "
}

# assume a particular format when explicitly specified
currentFormat=$DEEPDIVE_LOAD_FORMAT

# group given paths by format, and batch call to load to support parallel load
format=$currentFormat
for path; do
    # determine the format from the file name only, so that a directory
    # component such as "dump.tsv.d/" cannot be mistaken for an extension
    if [[ -z "$DEEPDIVE_LOAD_FORMAT" ]]; then
        name=${path##*/}
        case $name in
            *.tsv|*.tsv.*)           format=tsv      ;;
            *.csv|*.csv.*)           format=csv      ;;
            *.json-seq|*.json-seq.*) format=json-seq ;;
            *.sql|*.sql.*)           format=sql      ;;
            *)
                [[ -n "$DEEPDIVE_LOAD_FORMAT_DEFAULT" ]] ||
                    error "$path: Unrecognized format, specify DEEPDIVE_LOAD_FORMAT= or DEEPDIVE_LOAD_FORMAT_DEFAULT="
                format=$DEEPDIVE_LOAD_FORMAT_DEFAULT
                ;;
        esac
    fi

    # and how to decompress if needed (parallel tools are preferred if found)
    case $path in
        *.bz2)
            if type pbzip2 &>/dev/null
            then decompress="pbzip2 -d -k -c"
            else decompress="bzip2 -d -k -c"
            fi
            ;;
        *.gz)
            if type pigz &>/dev/null
            then decompress="pigz -d -k -c"
            # -c already leaves the input file in place; -k is omitted here
            # because gzip releases before 1.6 reject it
            else decompress="gzip -d -c"
            fi
            ;;
        *.sh)
            # a shell script is a source whose output is the data
            decompress=bash
            ;;
        *)
            decompress=
            ;;
    esac

    # load once the format changes, then start a new batch
    if [[ "$format" != "$currentFormat" ]]; then
        [[ -z "$currentFormat" ]] || loadQueued
        currentFormat=$format
        pathsInFormat=()
        sourcesInFormat=
    fi

    # queue a source expression
    # NOTE(review): escape4sh is defined elsewhere; presumably it quotes the
    # path so it survives the eval in loadQueued -- confirm.
    if [[ -n "$decompress" ]]; then
        # wrapped with decompression process if needed
        queue "$path" "<($decompress $(escape4sh "$path"))"
    else
        queue "$path" "$(escape4sh "$path")"
    fi
done

# load everything queued so far
loadQueued