https://github.com/vector-engineering/COVID19-CG
Snakefile
# coding: utf-8

import datetime
import gzip
import os

from pathlib import Path

configfile: "../config/config_gisaid.yaml"

envvars:
    "GISAID_URL",
    "GISAID_USERNAME",
    "GISAID_PASSWORD"

# Get today's date in ISO format (YYYY-MM-DD)
today_str = datetime.date.today().isoformat()


rule all:
    input:
        # Download latest data feed, process sequences
        download_status = os.path.join(
            config["data_folder"], "status", "download_" + today_str + ".done"
        ),
        copy_status = os.path.join(
            config["data_folder"], "status", "merge_sequences_" + today_str + ".done"
        ),
        # Cleaned metadata
        metadata = os.path.join(config["data_folder"], "metadata.csv")
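

# Example invocation (values are placeholders; real GISAID data feed credentials
# must be provided via the environment variables declared above):
#
#   export GISAID_URL="https://..."
#   export GISAID_USERNAME="..."
#   export GISAID_PASSWORD="..."
#   snakemake --cores 16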


rule download:
    """Download the data feed JSON object from the GISAID database, using our data feed credentials. The resulting file will need to be decompressed by `decompress_data_feed`
    """
    output:
        feed = temp(os.path.join(config["data_folder"], "feed.json")),
        status = touch(rules.all.input.download_status)
    threads: workflow.cores
    shell:
        "scripts/download.sh | unxz --threads={threads} -c - > {output.feed}"


rule process_feed:
    """Split up the data feed's individual JSON objects into metadata and fasta files. Chunk the fasta files so that every day we only reprocess the subset of fasta files that have changed. The smaller the chunk size, the more efficient the updates, but the more files on the filesystem.
    On a 48-core workstation with 128 GB RAM, aligning 200 sequences takes about 10 minutes, and this is more acceptable than having to align 1000 sequences, which takes ~1 hour. We end up with hundreds of files, but the filesystem seems to be handling it well.
    """
    input:
        feed = rules.download.output.feed,
    output:
        metadata_dirty = temp(os.path.join(config["data_folder"], "metadata_dirty.csv")),
        status = touch(rules.all.input.copy_status)
    params:
        fasta = directory(os.path.join(config["data_folder"], "fasta_raw"))
    threads: workflow.cores
    shell:
        """
        python3 scripts/process_feed.py -d {input.feed} -f {params.fasta} -m {output.metadata_dirty} -p {threads}
        """


rule clean_metadata:
    """Clean up metadata from GISAID
    """
    input:
        metadata_dirty = rules.process_feed.output.metadata_dirty,
        location_corrections = os.path.join(
            config["static_data_folder"], "location_corrections.csv"
        ),
    output:
        metadata_clean = os.path.join(config["data_folder"], "metadata.csv")
    shell:
        """
        python3 scripts/clean_metadata.py -i {input.metadata_dirty} -l {input.location_corrections} -o {output.metadata_clean}
        """
