https://github.com/schloi/MARVEL
Raw File
Tip revision: 1d412c89c7c1de9bacbc0f58b014ab7a29170240 authored by Siegfried on 07 July 2023, 17:37:34 UTC
updates
Tip revision: 1d412c8
txt2track.c

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <ctype.h>
#include <unistd.h>
#include <assert.h>

#include <sys/param.h>

#include "lib/colors.h"
#include "lib/oflags.h"
#include "lib/pass.h"
#include "lib/tracks.h"
#include "lib/utils.h"
#include "lib/lasidx.h"

#include "db/DB.h"
#include "dalign/align.h"

typedef struct
{
    uint64_t rid;
    uint64_t nvalues;
    uint64_t idxvalues;
} READ_ANNO;

// getopt

extern char* optarg;
extern int optind, opterr, optopt;

static void usage()
{
    printf("[-c] <db> <text> <track>\n");
    printf("-c ... ensure values are valid positions within the reads\n");
}

static int cmp_read_anno(const void* x, const void* y)
{
    return ((const READ_ANNO*)x)->rid - ((const READ_ANNO*)y)->rid;
}

int main(int argc, char* argv[])
{
    HITS_DB db;

    // process arguments

    int c;
    int check = 0;

    opterr = 0;

    while ((c = getopt(argc, argv, "c")) != -1)
    {
        switch (c)
        {
            case 'c':
                check = 1;
                break;

            default:
                printf("Unknow option: %s\n", argv[optind - 1]);
                usage();
                exit(1);
        }
    }

    if (argc - optind < 3)
    {
        usage();
        exit(1);
    }

    char* pathDb = argv[optind++];
    char* pathTxt = argv[optind++];
    char* nameTrack = argv[optind++];

    if (Open_DB(pathDb, &db))
    {
        printf("could not open '%s'\n", pathDb);
        return 1;
    }

    FILE* fileIn;

    if (strcmp(pathTxt, "-") == 0)
    {
        fileIn = stdin;
    }
    else
    {
        fileIn = fopen(pathTxt, "r");

        if (fileIn == NULL)
        {
            fprintf(stderr, "failed to open '%s'\n", pathTxt);
            exit(1);
        }
    }

    track_data* ints = NULL;
    uint64_t nints = 0;
    size_t maxints = 0;

    char* line = NULL;
    size_t maxline = 0;
    ssize_t linelen;

    READ_ANNO* anno = NULL;
    size_t maxanno = 0;
    uint64_t nanno = 0;

    while ( ( linelen = getline(&line, &maxline, fileIn) ) > 0 )
    {
        assert( line[ strlen( line ) - 1 ] == '\n' );

        uint32_t col = 0;
        char* token;
        uint64_t readid;

        while ( ( token = strsep(&line, " \t") ) )
        {
            int64_t value = strtoll(token, NULL, 10);
            col += 1;

            if ( col == 1 )
            {
                readid = value;
                continue;
            }

            int rlen = DB_READ_LEN(&db, readid);

            if ( check && ( value < 0 || value > rlen ) )
            {
                fprintf(stderr, "invalid position %" PRId64 " for read %" PRIu64 "\n", value, readid);
            }

            if ( nints + 1 >= maxints )
            {
                maxints = 1.2 * maxints + 1000;
                ints = realloc(ints, sizeof(track_data) * maxints);
            }

            ints[ nints ] = value;
            nints += 1;

        }

        if (nanno + 1 >= maxanno )
        {
            maxanno = maxanno * 1.2 + 1000;
            anno = realloc(anno, sizeof(READ_ANNO) * maxanno);
        }

        anno[ nanno ].rid = readid;
        anno[ nanno ].nvalues = col - 1;
        anno[ nanno ].idxvalues = nints - ( col - 1 );

        nanno += 1;
    }

    qsort(anno, nanno, sizeof(READ_ANNO), cmp_read_anno);

    track_anno* tanno = malloc( sizeof(track_anno) * (DB_NREADS(&db) + 1) );
    track_data* tdata = NULL;
    uint64_t ntdata = 0;
    size_t maxtdata = 0;

    bzero(tanno, sizeof(track_anno) * (DB_NREADS(&db) + 1));

    uint64_t i;
    for (i = 0; i < nanno; i++ )
    {
        READ_ANNO* ra = anno + i;

        if (ntdata + ra->nvalues >= maxtdata)
        {
            maxtdata = maxtdata * 1.2 + ra->nvalues;
            tdata = realloc( tdata, sizeof(track_data) * maxtdata );
        }

        tanno[ ra->rid ] += ra->nvalues * sizeof(track_data);

        uint64_t j;
        for ( j = 0; j < ra->nvalues; j++ )
        {
            tdata[ ntdata ] = ints[ ra->idxvalues + j ];
            ntdata += 1;
        }
    }

    free(anno);

    track_anno coff, off;
    off = 0;

    for (i = 0; i <= (uint64_t)DB_NREADS(&db); i++)
    {
        coff = tanno[i];
        tanno[i] = off;
        off += coff;
    }

    track_write(&db, nameTrack, 0, tanno, tdata, ntdata);

    free(tanno);
    free(tdata);

    Close_DB(&db);

    return 0;
}
back to top