From 1f94ce4c8eca5431a6e451e6bf7a983d91a6970f Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Tue, 14 Aug 2018 17:04:03 +0100 Subject: [PATCH] Move regidx code to htslib --- Makefile | 23 +- consensus.c | 2 +- csq.c | 2 +- mpileup.c | 2 +- ploidy.h | 2 +- plugins/fixploidy.mk | 4 +- plugins/mendelian.c | 2 +- regidx.c | 650 ------------------------------------------- regidx.h | 200 ------------- test/test-regidx.c | 392 -------------------------- vcfmerge.c | 2 +- 11 files changed, 15 insertions(+), 1266 deletions(-) delete mode 100644 regidx.c delete mode 100644 regidx.h delete mode 100644 test/test-regidx.c diff --git a/Makefile b/Makefile index 2c60f21df..860549626 100644 --- a/Makefile +++ b/Makefile @@ -38,7 +38,7 @@ OBJS = main.o vcfindex.o tabix.o \ vcfnorm.o vcfgtcheck.o vcfview.o vcfannotate.o vcfroh.o vcfconcat.o \ vcfcall.o mcall.o vcmp.o gvcf.o reheader.o convert.o vcfconvert.o tsv2vcf.o \ vcfcnv.o HMM.o vcfplugin.o consensus.o ploidy.o bin.o hclust.o version.o \ - regidx.o smpl_ilist.o csq.o vcfbuf.o \ + smpl_ilist.o csq.o vcfbuf.o \ mpileup.o bam2bcf.o bam2bcf_indel.o bam_sample.o \ vcfsort.o \ ccall.o em.o prob1.o kmin.o # the original samtools calling @@ -72,7 +72,7 @@ MISC_SCRIPTS = \ misc/plot-roh.py \ misc/run-roh.pl \ misc/vcfutils.pl -TEST_PROGRAMS = test/test-rbuf test/test-regidx +TEST_PROGRAMS = test/test-rbuf all: $(PROGRAMS) $(TEST_PROGRAMS) plugins @@ -183,7 +183,7 @@ tsv2vcf_h = tsv2vcf.h $(htslib_vcf_h) filter_h = filter.h $(htslib_vcf_h) gvcf_h = gvcf.h $(bcftools_h) khash_str2str_h = khash_str2str.h $(htslib_khash_h) -ploidy_h = ploidy.h regidx.h +ploidy_h = ploidy.h $(htslib_regidx_h) prob1_h = prob1.h $(htslib_vcf_h) $(call_h) smpl_ilist_h = smpl_ilist.h $(htslib_vcf_h) vcfbuf_h = vcfbuf.h $(htslib_vcf_h) @@ -200,7 +200,7 @@ vcffilter.o: vcffilter.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_ vcfgtcheck.o: vcfgtcheck.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(bcftools_h) hclust.h vcfindex.o: vcfindex.c $(htslib_vcf_h) $(htslib_tbx_h) $(htslib_kstring_h) $(htslib_bgzf_h) $(bcftools_h) vcfisec.o: vcfisec.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(bcftools_h) $(filter_h) -vcfmerge.o: vcfmerge.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_faidx_h) regidx.h $(bcftools_h) vcmp.h $(htslib_khash_h) +vcfmerge.o: vcfmerge.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_faidx_h) $(htslib_regidx_h) $(bcftools_h) vcmp.h $(htslib_khash_h) vcfnorm.o: vcfnorm.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_faidx_h) $(htslib_khash_str2int_h) $(bcftools_h) rbuf.h vcfquery.o: vcfquery.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_khash_str2int_h) $(htslib_vcfutils_h) $(bcftools_h) $(filter_h) $(convert_h) vcfroh.o: vcfroh.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_kstring_h) $(htslib_kseq_h) $(htslib_bgzf_h) $(bcftools_h) HMM.h $(smpl_ilist_h) $(filter_h) @@ -227,9 +227,8 @@ ploidy.o: ploidy.c $(htslib_khash_str2int_h) $(htslib_kseq_h) $(htslib_hts_h) $( polysomy.o: polysomy.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(bcftools_h) peakfit.h peakfit.o: peakfit.c peakfit.h $(htslib_hts_h) $(htslib_kstring_h) bin.o: bin.c $(bcftools_h) bin.h -regidx.o: regidx.c $(htslib_hts_h) $(htslib_kstring_h) $(htslib_kseq_h) $(htslib_khash_str2int_h) regidx.h -consensus.o: consensus.c $(htslib_vcf_h) $(htslib_kstring_h) $(htslib_synced_bcf_reader_h) $(htslib_kseq_h) $(htslib_bgzf_h) regidx.h $(bcftools_h) rbuf.h $(filter_h) -mpileup.o: mpileup.c $(htslib_sam_h) $(htslib_faidx_h) $(htslib_kstring_h) $(htslib_khash_str2int_h) regidx.h $(bcftools_h) $(bam2bcf_h) $(bam_sample_h) $(gvcf_h) +consensus.o: consensus.c $(htslib_vcf_h) $(htslib_kstring_h) $(htslib_synced_bcf_reader_h) $(htslib_kseq_h) $(htslib_bgzf_h) $(htslib_regidx_h) $(bcftools_h) rbuf.h $(filter_h) +mpileup.o: mpileup.c $(htslib_sam_h) $(htslib_faidx_h) $(htslib_kstring_h) $(htslib_khash_str2int_h) $(htslib_regidx_h) $(bcftools_h) $(bam2bcf_h) $(bam_sample_h) $(gvcf_h) bam2bcf.o: bam2bcf.c $(htslib_hts_h) $(htslib_sam_h) $(htslib_kstring_h) $(htslib_kfunc_h) $(bam2bcf_h) mw.h bam2bcf_indel.o: bam2bcf_indel.c $(htslib_hts_h) $(htslib_sam_h) $(htslib_khash_str2int_h) $(bam2bcf_h) $(htslib_ksort_h) bam_sample.o: bam_sample.c $(htslib_hts_h) $(htslib_kstring_h) $(htslib_khash_str2int_h) $(khash_str2str_h) $(bam_sample_h) $(bcftools_h) @@ -238,7 +237,7 @@ hclust.o: hclust.c $(htslib_hts_h) $(htslib_kstring_h) $(bcftools_h) hclust.h HMM.o: HMM.c $(htslib_hts_h) HMM.h vcfbuf.o: vcfbuf.c $(htslib_vcf_h) $(htslib_vcfutils_h) $(bcftools_h) $(vcfbuf_h) rbuf.h smpl_ilist.o: smpl_ilist.c $(bcftools_h) $(smpl_ilist_h) -csq.o: csq.c $(htslib_hts_h) $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_khash_h) $(htslib_khash_str2int_h) $(htslib_kseq_h) $(htslib_faidx_h) $(bcftools_h) $(filter_h) regidx.h kheap.h $(smpl_ilist_h) rbuf.h +csq.o: csq.c $(htslib_hts_h) $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_khash_h) $(htslib_khash_str2int_h) $(htslib_kseq_h) $(htslib_faidx_h) $(bcftools_h) $(filter_h) $(htslib_regidx_h) kheap.h $(smpl_ilist_h) rbuf.h # test programs @@ -247,12 +246,10 @@ csq.o: csq.c $(htslib_hts_h) $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(hts # (regression.sh sets $REF_PATH to a subdirectory itself.) test-no-plugins: $(PROGRAMS) $(TEST_PROGRAMS) $(BGZIP) $(TABIX) ./test/test-rbuf - ./test/test-regidx REF_PATH=: ./test/test.pl --exec bgzip=$(BGZIP) --exec tabix=$(TABIX) test-plugins: $(PROGRAMS) $(TEST_PROGRAMS) $(BGZIP) $(TABIX) plugins ./test/test-rbuf - ./test/test-regidx REF_PATH=: ./test/test.pl --plugins --exec bgzip=$(BGZIP) --exec tabix=$(TABIX) test/test-rbuf.o: test/test-rbuf.c rbuf.h @@ -260,12 +257,6 @@ test/test-rbuf.o: test/test-rbuf.c rbuf.h test/test-rbuf: test/test-rbuf.o $(CC) $(LDFLAGS) -o $@ $^ $(ALL_LIBS) -test/test-regidx.o: test/test-regidx.c $(htslib_kstring_h) regidx.h - -test/test-regidx: test/test-regidx.o regidx.o $(HTSLIB) - $(CC) $(ALL_LDFLAGS) -o $@ $^ $(HTSLIB) -lpthread $(HTSLIB_LIB) $(ALL_LIBS) - - # make docs target depends the a2x asciidoc program doc/bcftools.1: doc/bcftools.txt cd doc && a2x -adate="$(DOC_DATE)" -aversion=$(DOC_VERSION) --doctype manpage --format manpage bcftools.txt diff --git a/consensus.c b/consensus.c index 2c4a9040e..e9217fa74 100644 --- a/consensus.c +++ b/consensus.c @@ -37,7 +37,7 @@ #include #include #include -#include "regidx.h" +#include #include "bcftools.h" #include "rbuf.h" #include "filter.h" diff --git a/csq.c b/csq.c index 8f9da5d27..2383b4233 100644 --- a/csq.c +++ b/csq.c @@ -146,7 +146,7 @@ #include #include "bcftools.h" #include "filter.h" -#include "regidx.h" +#include #include "kheap.h" #include "smpl_ilist.h" #include "rbuf.h" diff --git a/mpileup.c b/mpileup.c index de5988c1d..c3f7bbabe 100644 --- a/mpileup.c +++ b/mpileup.c @@ -39,7 +39,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include -#include "regidx.h" +#include #include "bcftools.h" #include "bam2bcf.h" #include "bam_sample.h" diff --git a/ploidy.h b/ploidy.h index 1e7d2f78f..6deef737f 100644 --- a/ploidy.h +++ b/ploidy.h @@ -55,7 +55,7 @@ #ifndef __PLOIDY_H__ #define __PLOIDY_H__ -#include "regidx.h" +#include typedef struct _ploidy_t ploidy_t; diff --git a/plugins/fixploidy.mk b/plugins/fixploidy.mk index 300654da3..5838a73c2 100644 --- a/plugins/fixploidy.mk +++ b/plugins/fixploidy.mk @@ -1,2 +1,2 @@ -plugins/fixploidy.so: plugins/fixploidy.c version.h version.c regidx.h regidx.c ploidy.h ploidy.c - $(CC) $(PLUGIN_FLAGS) $(CFLAGS) $(ALL_CPPFLAGS) $(EXTRA_CPPFLAGS) $(LDFLAGS) -o $@ ploidy.c regidx.c version.c $< $(LIBS) +plugins/fixploidy.so: plugins/fixploidy.c version.h version.c ploidy.h ploidy.c + $(CC) $(PLUGIN_FLAGS) $(CFLAGS) $(ALL_CPPFLAGS) $(EXTRA_CPPFLAGS) $(LDFLAGS) -o $@ ploidy.c version.c $< $(LIBS) diff --git a/plugins/mendelian.c b/plugins/mendelian.c index 88b0d962a..2d511e155 100644 --- a/plugins/mendelian.c +++ b/plugins/mendelian.c @@ -36,7 +36,7 @@ #include #include // for isatty #include "bcftools.h" -#include "regidx.h" +#include #define MODE_COUNT 1 #define MODE_LIST_GOOD 2 diff --git a/regidx.c b/regidx.c deleted file mode 100644 index 5c6c8ce59..000000000 --- a/regidx.c +++ /dev/null @@ -1,650 +0,0 @@ -/* - Copyright (C) 2014-2017 Genome Research Ltd. - - Author: Petr Danecek - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. -*/ - -#include -#include -#include -#include -#include -#include "regidx.h" - -#define MAX_COOR_0 REGIDX_MAX // CSI and hts_itr_query limit, 0-based - -#define iBIN(x) ((x)>>13) - -typedef struct -{ - uint32_t beg, end; -} -reg_t; - -typedef struct -{ - uint32_t pos, ireg; // y-coordinate and a pointer to reglist.reg and reglist.dat -} -pos_t; - -typedef struct _reglist_t reglist_t; - -typedef struct -{ - uint32_t beg, end, ireg; // query coordinates and the active region - regidx_t *ridx; - reglist_t *list; - int active; -} -_itr_t; - -// List of regions for one chromosome. -struct _reglist_t -{ - uint32_t *idx, nidx; // index to list.reg+1 - uint32_t nreg, mreg; // n:used, m:allocated - reg_t *reg; // regions - void *dat; // payload data - char *seq; // sequence name - int unsorted; - -}; - -// Container of all sequences -struct _regidx_t -{ - int nseq, mseq; // n:used, m:alloced - reglist_t *seq; // regions for each sequence - void *seq2regs; // hash for fast lookup from chr name to regions - char **seq_names; - regidx_free_f free; // function to free any data allocated by regidx_parse_f - regidx_parse_f parse; // parse one input line - void *usr; // user data to pass to regidx_parse_f - int payload_size; - void *payload; // temporary payload data set by regidx_parse_f (sequence is not known beforehand) - kstring_t str; -}; - -int regidx_seq_nregs(regidx_t *idx, const char *seq) -{ - int iseq; - if ( khash_str2int_get(idx->seq2regs, seq, &iseq)!=0 ) return 0; // no such sequence - return idx->seq[iseq].nreg; -} - -int regidx_nregs(regidx_t *idx) -{ - int i, nreg = 0; - for (i=0; inseq; i++) nreg += idx->seq[i].nreg; - return nreg; -} - -char **regidx_seq_names(regidx_t *idx, int *n) -{ - *n = idx->nseq; - return idx->seq_names; -} - -int regidx_insert_list(regidx_t *idx, char *line, char delim) -{ - kstring_t tmp = {0,0,0}; - char *ss = line; - while ( *ss ) - { - char *se = ss; - while ( *se && *se!=delim ) se++; - tmp.l = 0; - kputsn(ss, se-ss, &tmp); - if ( regidx_insert(idx,tmp.s) < 0 ) - { - free(tmp.s); - return -1; - } - if ( !*se ) break; - ss = se+1; - } - free(tmp.s); - return 0; -} - -static inline int cmp_regs(reg_t *a, reg_t *b) -{ - if ( a->beg < b->beg ) return -1; - if ( a->beg > b->beg ) return 1; - if ( a->end < b->end ) return 1; // longer intervals come first - if ( a->end > b->end ) return -1; - return 0; -} -static int cmp_reg_ptrs(const void *a, const void *b) -{ - return cmp_regs((reg_t*)a,(reg_t*)b); -} -static int cmp_reg_ptrs2(const void *a, const void *b) -{ - return cmp_regs(*((reg_t**)a),*((reg_t**)b)); -} - -inline int regidx_push(regidx_t *idx, char *chr_beg, char *chr_end, uint32_t beg, uint32_t end, void *payload) -{ - if ( beg > MAX_COOR_0 ) beg = MAX_COOR_0; - if ( end > MAX_COOR_0 ) end = MAX_COOR_0; - - int rid; - idx->str.l = 0; - kputsn(chr_beg, chr_end-chr_beg+1, &idx->str); - if ( khash_str2int_get(idx->seq2regs, idx->str.s, &rid)!=0 ) - { - // new chromosome - idx->nseq++; - int m_prev = idx->mseq; - hts_expand0(reglist_t,idx->nseq,idx->mseq,idx->seq); - hts_expand0(char*,idx->nseq,m_prev,idx->seq_names); - idx->seq_names[idx->nseq-1] = strdup(idx->str.s); - rid = khash_str2int_inc(idx->seq2regs, idx->seq_names[idx->nseq-1]); - } - - reglist_t *list = &idx->seq[rid]; - list->seq = idx->seq_names[rid]; - list->nreg++; - int mreg = list->mreg; - hts_expand(reg_t,list->nreg,list->mreg,list->reg); - list->reg[list->nreg-1].beg = beg; - list->reg[list->nreg-1].end = end; - if ( idx->payload_size ) - { - if ( mreg != list->mreg ) list->dat = realloc(list->dat,idx->payload_size*list->mreg); - memcpy((char *)list->dat + idx->payload_size*(list->nreg-1), payload, idx->payload_size); - } - if ( !list->unsorted && list->nreg>1 && cmp_regs(&list->reg[list->nreg-2],&list->reg[list->nreg-1])>0 ) list->unsorted = 1; - return 0; -} - -int regidx_insert(regidx_t *idx, char *line) -{ - if ( !line ) return 0; - char *chr_from, *chr_to; - uint32_t beg,end; - int ret = idx->parse(line,&chr_from,&chr_to,&beg,&end,idx->payload,idx->usr); - if ( ret==-2 ) return -1; // error - if ( ret==-1 ) return 0; // skip the line - regidx_push(idx, chr_from,chr_to,beg,end,idx->payload); - return 0; -} - -regidx_t *regidx_init_string(const char *str, regidx_parse_f parser, regidx_free_f free_f, size_t payload_size, void *usr_dat) -{ - regidx_t *idx = (regidx_t*) calloc(1,sizeof(regidx_t)); - if ( !idx ) return NULL; - - idx->free = free_f; - idx->parse = parser ? parser : regidx_parse_tab; - idx->usr = usr_dat; - idx->seq2regs = khash_str2int_init(); - idx->payload_size = payload_size; - if ( payload_size ) idx->payload = malloc(payload_size); - - kstring_t tmp = {0,0,0}; - const char *ss = str; - while ( *ss ) - { - while ( *ss && isspace(*ss) ) ss++; - const char *se = ss; - while ( *se && *se!='\r' && *se!='\n' ) se++; - tmp.l = 0; - kputsn(ss, se-ss, &tmp); - regidx_insert(idx,tmp.s); - while ( *se && isspace(*se) ) se++; - ss = se; - } - free(tmp.s); - return idx; -} - -regidx_t *regidx_init(const char *fname, regidx_parse_f parser, regidx_free_f free_f, size_t payload_size, void *usr_dat) -{ - if ( !parser ) - { - if ( !fname ) parser = regidx_parse_tab; - else - { - int len = strlen(fname); - if ( len>=7 && !strcasecmp(".bed.gz",fname+len-7) ) - parser = regidx_parse_bed; - else if ( len>=8 && !strcasecmp(".bed.bgz",fname+len-8) ) - parser = regidx_parse_bed; - else if ( len>=4 && !strcasecmp(".bed",fname+len-4) ) - parser = regidx_parse_bed; - else if ( len>=4 && !strcasecmp(".vcf",fname+len-4) ) - parser = regidx_parse_vcf; - else if ( len>=7 && !strcasecmp(".vcf.gz",fname+len-7) ) - parser = regidx_parse_vcf; - else - parser = regidx_parse_tab; - } - } - - regidx_t *idx = (regidx_t*) calloc(1,sizeof(regidx_t)); - idx->free = free_f; - idx->parse = parser; - idx->usr = usr_dat; - idx->seq2regs = khash_str2int_init(); - idx->payload_size = payload_size; - if ( payload_size ) idx->payload = malloc(payload_size); - - if ( !fname ) return idx; - - kstring_t str = {0,0,0}; - - htsFile *fp = hts_open(fname,"r"); - if ( !fp ) goto error; - - while ( hts_getline(fp, KS_SEP_LINE, &str) > 0 ) - { - if ( regidx_insert(idx, str.s) ) goto error; - } - - free(str.s); - if ( hts_close(fp)!=0 ) - { - fprintf(stderr,"[%s] Error: close failed .. %s\n", __func__,fname); - goto error; - } - return idx; - -error: - free(str.s); - if ( fp ) hts_close(fp); - regidx_destroy(idx); - return NULL; -} - -void regidx_destroy(regidx_t *idx) -{ - int i, j; - for (i=0; inseq; i++) - { - reglist_t *list = &idx->seq[i]; - if ( idx->free ) - { - for (j=0; jnreg; j++) - idx->free((char *)list->dat + idx->payload_size*j); - } - free(list->dat); - free(list->reg); - free(list->idx); - } - free(idx->seq_names); - free(idx->seq); - free(idx->str.s); - free(idx->payload); - khash_str2int_destroy_free(idx->seq2regs); - free(idx); -} - -int _reglist_build_index(regidx_t *regidx, reglist_t *list) -{ - int i; - if ( list->unsorted ) - { - if ( !regidx->payload_size ) - qsort(list->reg,list->nreg,sizeof(reg_t),cmp_reg_ptrs); - else - { - reg_t **ptr = (reg_t**) malloc(sizeof(reg_t*)*list->nreg); - for (i=0; inreg; i++) ptr[i] = list->reg + i; - qsort(ptr,list->nreg,sizeof(*ptr),cmp_reg_ptrs2); - - void *tmp_dat = malloc(regidx->payload_size*list->nreg); - for (i=0; inreg; i++) - { - size_t iori = ptr[i] - list->reg; - memcpy((char *)tmp_dat+i*regidx->payload_size, - (char *)list->dat+iori*regidx->payload_size, - regidx->payload_size); - } - free(list->dat); - list->dat = tmp_dat; - - reg_t *tmp_reg = (reg_t*) malloc(sizeof(reg_t)*list->nreg); - for (i=0; inreg; i++) - { - size_t iori = ptr[i] - list->reg; - tmp_reg[i] = list->reg[iori]; - } - free(ptr); - free(list->reg); - list->reg = tmp_reg; - list->mreg = list->nreg; - } - list->unsorted = 0; - } - - list->nidx = 0; - int j,k, midx = 0; - for (j=0; jnreg; j++) - { - int ibeg = iBIN(list->reg[j].beg); - int iend = iBIN(list->reg[j].end); - if ( midx <= iend ) - { - int old_midx = midx; - midx = iend + 1; - kroundup32(midx); - list->idx = (uint32_t*) realloc(list->idx, midx*sizeof(uint32_t)); - memset(list->idx+old_midx, 0, sizeof(uint32_t)*(midx-old_midx)); - } - if ( ibeg==iend ) - { - if ( !list->idx[ibeg] ) list->idx[ibeg] = j + 1; - } - else - { - for (k=ibeg; k<=iend; k++) - if ( !list->idx[k] ) list->idx[k] = j + 1; - } - if ( list->nidx < iend+1 ) list->nidx = iend+1; - } - - return 0; -} - -int regidx_overlap(regidx_t *regidx, const char *chr, uint32_t beg, uint32_t end, regitr_t *regitr) -{ - if ( regitr ) regitr->seq = NULL; - - int iseq, ireg; - if ( khash_str2int_get(regidx->seq2regs, chr, &iseq)!=0 ) return 0; // no such sequence - - reglist_t *list = ®idx->seq[iseq]; - if ( !list->nreg ) return 0; - - if ( list->nreg==1 ) - { - if ( beg > list->reg[0].end ) return 0; - if ( end < list->reg[0].beg ) return 0; - ireg = 0; - } - else - { - if ( !list->idx ) - _reglist_build_index(regidx,list); - - int ibeg = iBIN(beg); - if ( ibeg >= list->nidx ) return 0; // beg is too big - - // find a matching region - uint32_t i = list->idx[ibeg]; - if ( !i ) - { - int iend = iBIN(end); - if ( iend > list->nidx ) iend = list->nidx; - for (i=ibeg; i<=iend; i++) - if ( list->idx[i] ) break; - if ( i>iend ) return 0; - i = list->idx[i]; - } - for (ireg=i-1; iregnreg; ireg++) - { - if ( list->reg[ireg].beg > end ) return 0; // no match, past the query region - if ( list->reg[ireg].end >= beg && list->reg[ireg].beg <= end ) break; // found - } - - if ( ireg >= list->nreg ) return 0; // no match - } - - if ( !regitr ) return 1; // match, but no more info to save - - // may need to iterate over the matching regions later - _itr_t *itr = (_itr_t*)regitr->itr; - itr->ridx = regidx; - itr->list = list; - itr->beg = beg; - itr->end = end; - itr->ireg = ireg; - itr->active = 0; - - regitr->seq = list->seq; - regitr->beg = list->reg[ireg].beg; - regitr->end = list->reg[ireg].end; - if ( regidx->payload_size ) - regitr->payload = (char *)list->dat + regidx->payload_size*ireg; - - return 1; -} - -int regidx_parse_bed(const char *line, char **chr_beg, char **chr_end, uint32_t *beg, uint32_t *end, void *payload, void *usr) -{ - char *ss = (char*) line; - while ( *ss && isspace(*ss) ) ss++; - if ( !*ss ) return -1; // skip blank lines - if ( *ss=='#' ) return -1; // skip comments - - char *se = ss; - while ( *se && !isspace(*se) ) se++; - - *chr_beg = ss; - *chr_end = se-1; - - if ( !*se ) - { - // just the chromosome name - *beg = 0; - *end = MAX_COOR_0; - return 0; - } - - ss = se+1; - *beg = strtod(ss, &se); - if ( ss==se ) { fprintf(stderr,"Could not parse bed line: %s\n", line); return -2; } - - ss = se+1; - *end = strtod(ss, &se) - 1; - if ( ss==se ) { fprintf(stderr,"Could not parse bed line: %s\n", line); return -2; } - - return 0; -} - -int regidx_parse_tab(const char *line, char **chr_beg, char **chr_end, uint32_t *beg, uint32_t *end, void *payload, void *usr) -{ - char *ss = (char*) line; - while ( *ss && isspace(*ss) ) ss++; - if ( !*ss ) return -1; // skip blank lines - if ( *ss=='#' ) return -1; // skip comments - - char *se = ss; - while ( *se && !isspace(*se) ) se++; - - *chr_beg = ss; - *chr_end = se-1; - - if ( !*se ) - { - // just the chromosome name - *beg = 0; - *end = MAX_COOR_0; - return 0; - } - - ss = se+1; - *beg = strtod(ss, &se); - if ( ss==se ) { fprintf(stderr,"Could not parse tab line: %s\n", line); return -2; } - if ( *beg==0 ) { fprintf(stderr,"Could not parse tab line, expected 1-based coordinate: %s\n", line); return -2; } - (*beg)--; - - if ( !se[0] || !se[1] ) - *end = *beg; - else - { - ss = se+1; - *end = strtod(ss, &se); - if ( ss==se || (*se && !isspace(*se)) ) *end = *beg; - else if ( *end==0 ) { fprintf(stderr,"Could not parse tab line, expected 1-based coordinate: %s\n", line); return -2; } - else (*end)--; - } - return 0; -} - -int regidx_parse_vcf(const char *line, char **chr_beg, char **chr_end, uint32_t *beg, uint32_t *end, void *payload, void *usr) -{ - int ret = regidx_parse_tab(line, chr_beg, chr_end, beg, end, payload, usr); - if ( !ret ) *end = *beg; - return ret; -} - -int regidx_parse_reg(const char *line, char **chr_beg, char **chr_end, uint32_t *beg, uint32_t *end, void *payload, void *usr) -{ - char *ss = (char*) line; - while ( *ss && isspace(*ss) ) ss++; - if ( !*ss ) return -1; // skip blank lines - if ( *ss=='#' ) return -1; // skip comments - - char *se = ss; - while ( *se && *se!=':' ) se++; - - *chr_beg = ss; - *chr_end = se-1; - - if ( !*se ) - { - *beg = 0; - *end = MAX_COOR_0; - return 0; - } - - ss = se+1; - *beg = strtod(ss, &se); - if ( ss==se ) { fprintf(stderr,"Could not parse reg line: %s\n", line); return -2; } - if ( *beg==0 ) { fprintf(stderr,"Could not parse reg line, expected 1-based coordinate: %s\n", line); return -2; } - (*beg)--; - - if ( !se[0] || !se[1] ) - *end = se[0]=='-' ? MAX_COOR_0 : *beg; - else - { - ss = se+1; - *end = strtod(ss, &se); - if ( ss==se ) *end = *beg; - else if ( *end==0 ) { fprintf(stderr,"Could not parse reg line, expected 1-based coordinate: %s\n", line); return -2; } - else (*end)--; - } - return 0; -} - -regitr_t *regitr_init(regidx_t *regidx) -{ - regitr_t *regitr = (regitr_t*) calloc(1,sizeof(regitr_t)); - regitr->itr = (_itr_t*) calloc(1,sizeof(_itr_t)); - _itr_t *itr = (_itr_t*) regitr->itr; - itr->ridx = regidx; - itr->list = NULL; - return regitr; -} - -void regitr_reset(regidx_t *regidx, regitr_t *regitr) -{ - _itr_t *itr = (_itr_t*) regitr->itr; - memset(itr,0,sizeof(_itr_t)); - itr->ridx = regidx; -} - -void regitr_destroy(regitr_t *regitr) -{ - free(regitr->itr); - free(regitr); -} - -int regitr_overlap(regitr_t *regitr) -{ - if ( !regitr->seq ) return 0; - - _itr_t *itr = (_itr_t*) regitr->itr; - if ( !itr->active ) - { - // is this the first call after regidx_overlap? - itr->active = 1; - itr->ireg++; - return 1; - } - - reglist_t *list = itr->list; - - int i; - for (i=itr->ireg; inreg; i++) - { - if ( list->reg[i].beg > itr->end ) return 0; // no match, past the query region - if ( list->reg[i].end >= itr->beg && list->reg[i].beg <= itr->end ) break; // found - } - - if ( i >= list->nreg ) return 0; // no match - - itr->ireg = i + 1; - regitr->seq = list->seq; - regitr->beg = list->reg[i].beg; - regitr->end = list->reg[i].end; - if ( itr->ridx->payload_size ) - regitr->payload = (char *)list->dat + itr->ridx->payload_size*i; - - return 1; -} - -int regitr_loop(regitr_t *regitr) -{ - _itr_t *itr = (_itr_t*) regitr->itr; - regidx_t *regidx = itr->ridx; - - if ( !itr->list ) // first time here - { - itr->list = regidx->seq; - itr->ireg = 0; - } - - size_t iseq = itr->list - regidx->seq; - if ( iseq >= regidx->nseq ) return 0; - - if ( itr->ireg >= itr->list->nreg ) - { - iseq++; - if ( iseq >= regidx->nseq ) return 0; // no more sequences, done - itr->ireg = 0; - itr->list = ®idx->seq[iseq]; - } - - regitr->seq = itr->list->seq; - regitr->beg = itr->list->reg[itr->ireg].beg; - regitr->end = itr->list->reg[itr->ireg].end; - if ( regidx->payload_size ) - regitr->payload = (char *)itr->list->dat + regidx->payload_size*itr->ireg; - itr->ireg++; - - return 1; -} - - -void regitr_copy(regitr_t *dst, regitr_t *src) -{ - _itr_t *dst_itr = (_itr_t*) dst->itr; - _itr_t *src_itr = (_itr_t*) src->itr; - *dst_itr = *src_itr; - *dst = *src; - dst->itr = dst_itr; -} - - diff --git a/regidx.h b/regidx.h deleted file mode 100644 index a654dbdd8..000000000 --- a/regidx.h +++ /dev/null @@ -1,200 +0,0 @@ -/* - Copyright (C) 2014-2016 Genome Research Ltd. - - Author: Petr Danecek - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. -*/ - -/* - Region indexing with an optional payload. - - Example of usage: - - // Init the parser and print regions. In this example the payload is a - // pointer to a string. For the description of parse_custom and - // free_custom functions, see regidx_parse_f and regidx_free_f below, - // and for working example see test/test-regidx.c. - regidx_t *idx = regidx_init(in_fname,parse_custom,free_custom,sizeof(char*),NULL); - - // Query overlap with chr:beg-end (beg,end are 1-based coordinates) - regitr_t *itr = regitr_init(idx); - if ( regidx_overlap(idx, chr,beg-1,end-1, itr) ) printf("There is an overlap!\n"); - - while ( regitr_overlap(itr) ) - { - printf("[%d,%d] overlaps with [%d,%d], payload=%s\n", beg,end, - itr->beg+1, itr->end+1, regitr_payload(itr,char*)); - } - - regidx_destroy(idx); - regitr_destroy(itr); - - - Another example, loop over all regions: - - regidx_t *idx = regidx_init(in_fname,NULL,NULL,0,NULL); - regitr_t *itr = regitr_init(idx); - - while ( regitr_loop(itr) ) - printf("chr=%s beg=%d end=%d\n", itr->seq, itr->beg+1, itr->end+1); - - regidx_destroy(idx); - regitr_destroy(itr); -*/ - -#ifndef __REGIDX_H__ -#define __REGIDX_H__ - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -#define REGIDX_MAX 2147483646 // maximum regidx coordinate (0-based) - -typedef struct _regidx_t regidx_t; -typedef struct -{ - uint32_t beg,end; - void *payload; - char *seq; - void *itr; -} -regitr_t; - -#define regitr_payload(itr,type_t) (*((type_t*)(itr)->payload)) - -/* - * regidx_parse_f - Function to parse one input line, such as regidx_parse_bed - * or regidx_parse_tab below. The function is expected to set `chr_from` and - * `chr_to` to point to first and last character of chromosome name and set - * coordinates `beg` and `end` (0-based, inclusive). If regidx_init() was - * called with non-zero payload_size, the `payload` points to a memory - * location of the payload_size and `usr` is the data passed to regidx_init(). - * Any memory allocated by the function will be freed by regidx_free_f called - * by regidx_destroy(). - * - * Return value: 0 on success, -1 to skip a record, -2 on fatal error. - */ -typedef int (*regidx_parse_f)(const char *line, char **chr_beg, char **chr_end, uint32_t *beg, uint32_t *end, void *payload, void *usr); -typedef void (*regidx_free_f)(void *payload); - -/* - * A note about the parsers: - * - leading spaces are ignored - * - lines starting with "#" are ignored - */ -int regidx_parse_bed(const char*,char**,char**,uint32_t*,uint32_t*,void*,void*); // CHROM or whitespace-sepatated CHROM,FROM,TO (0-based,right-open) -int regidx_parse_tab(const char*,char**,char**,uint32_t*,uint32_t*,void*,void*); // CHROM or whitespace-separated CHROM,POS (1-based, inclusive) -int regidx_parse_reg(const char*,char**,char**,uint32_t*,uint32_t*,void*,void*); // CHROM, CHROM:POS, CHROM:FROM-TO, CHROM:FROM- (1-based, inclusive) -int regidx_parse_vcf(const char*,char**,char**,uint32_t*,uint32_t*,void*,void*); - -/* - * regidx_init() - creates new index - * regidx_init_string() - creates new index, from a string rather than from a file - * - * @param fname: input file name or NULL if regions will be added one-by-one via regidx_insert() - * @param parsef: regidx_parse_bed, regidx_parse_tab or see description of regidx_parse_f. If NULL, - * the format will be autodected, currently either regidx_parse_tab (the default) or - * regidx_parse_bed (file must be named 'bed' or 'bed.gz') will be used. Note that - * the exact autodetection algorithm will change. - * @param freef: NULL or see description of regidx_parse_f - * @param payload_size: 0 with regidx_parse_bed, regidx_parse_tab or see regidx_parse_f - * @param usr: optional user data passed to regidx_parse_f - * - * Returns index on success or NULL on error. - */ -regidx_t *regidx_init(const char *fname, regidx_parse_f parsef, regidx_free_f freef, size_t payload_size, void *usr); -regidx_t *regidx_init_string(const char *string, regidx_parse_f parsef, regidx_free_f freef, size_t payload_size, void *usr); - -/* - * regidx_destroy() - free memory allocated by regidx_init - */ -void regidx_destroy(regidx_t *idx); - -/* - * regidx_overlap() - check overlap of the location chr:from-to with regions - * @param beg,end: 0-based start, end coordinate (inclusive) - * @param itr: pointer to iterator, can be NULL if regidx_loop not needed - * - * Returns 0 if there is no overlap or 1 if overlap is found. The overlapping - * regions can be iterated as shown in the example above. - */ -int regidx_overlap(regidx_t *idx, const char *chr, uint32_t beg, uint32_t end, regitr_t *itr); - -/* - * regidx_insert() - add a new region. - * regidx_insert_list() - add new regions from a list - * regidx_push() - low level insertion of a new region - * - * Returns 0 on success or -1 on error. - */ -int regidx_insert(regidx_t *idx, char *line); -int regidx_insert_list(regidx_t *idx, char *line, char delim); -int regidx_push(regidx_t *idx, char *chr_beg, char *chr_end, uint32_t beg, uint32_t end, void *payload); - -/* - * regidx_seq_names() - return list of all sequence names - */ -char **regidx_seq_names(regidx_t *idx, int *n); - -/* - * regidx_seq_nregs() - number of regions - * regidx_nregs() - total number of regions - */ -int regidx_seq_nregs(regidx_t *idx, const char *seq); -int regidx_nregs(regidx_t *idx); - -/* - * regitr_init() - initialize an iterator. The idx parameter is required only - * with regitr_loop. If only regitr_overlap is called, NULL - * can be given. - * - * regitr_reset() - initialize an iterator for a repeated regitr_loop cycle. - * Not required with regitr_overlap. - */ -regitr_t *regitr_init(regidx_t *idx); -void regitr_destroy(regitr_t *itr); -void regitr_reset(regidx_t *idx, regitr_t *itr); - -/* - * regitr_overlap() - next overlapping region - * Returns 0 when done or 1 when itr is set to next region - */ -int regitr_overlap(regitr_t *itr); - -/* - * regitr_loop() - loop over all regions - * Returns 0 when done or 1 when itr is set to next region - */ -int regitr_loop(regitr_t *itr); - -/* - * regitr_copy() - create a copy of an iterator for a repeated iteration with regitr_loop - */ -void regitr_copy(regitr_t *dst, regitr_t *src); - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/test/test-regidx.c b/test/test-regidx.c deleted file mode 100644 index 4d44ea4bf..000000000 --- a/test/test-regidx.c +++ /dev/null @@ -1,392 +0,0 @@ -/* test/test-regidx.c -- Regions index test harness. - - gcc -g -Wall -O0 -I. -I../htslib/ -L../htslib regidx.c -o test-regidx test-regidx.c -lhts - - Copyright (C) 2014 Genome Research Ltd. - - Author: Petr Danecek - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. -*/ - -#include -#include -#include -#include -#include -#include -#include -#include -#include "regidx.h" - -static int verbose = 0; - -void debug(const char *format, ...) -{ - if ( verbose<2 ) return; - va_list ap; - va_start(ap, format); - vfprintf(stderr, format, ap); - va_end(ap); -} -void info(const char *format, ...) -{ - if ( verbose<1 ) return; - va_list ap; - va_start(ap, format); - vfprintf(stderr, format, ap); - va_end(ap); -} -void error(const char *format, ...) -{ - va_list ap; - va_start(ap, format); - vfprintf(stderr, format, ap); - va_end(ap); - exit(-1); -} - -int custom_parse(const char *line, char **chr_beg, char **chr_end, uint32_t *beg, uint32_t *end, void *payload, void *usr) -{ - // Use the standard parser for CHROM,FROM,TO - int i, ret = regidx_parse_tab(line,chr_beg,chr_end,beg,end,NULL,NULL); - if ( ret!=0 ) return ret; - - // Skip the fields that were parsed above - char *ss = (char*) line; - while ( *ss && isspace(*ss) ) ss++; - for (i=0; i<3; i++) - { - while ( *ss && !isspace(*ss) ) ss++; - if ( !*ss ) return -2; // wrong number of fields - while ( *ss && isspace(*ss) ) ss++; - } - if ( !*ss ) return -2; - - // Parse the payload - char *se = ss; - while ( *se && !isspace(*se) ) se++; - char **dat = (char**) payload; - *dat = (char*) malloc(se-ss+1); - memcpy(*dat,ss,se-ss+1); - (*dat)[se-ss] = 0; - return 0; -} -void custom_free(void *payload) -{ - char **dat = (char**)payload; - free(*dat); -} - -void test_sequential_access(void) -{ - // Init index with no file name, we will insert the regions manually - regidx_t *idx = regidx_init(NULL,custom_parse,custom_free,sizeof(char*),NULL); - if ( !idx ) error("init failed\n"); - - // Insert regions - kstring_t str = {0,0,0}; - int i, n = 10; - for (i=0; ibeg!=itr->end || itr->beg+1!=10*(i+1) ) error("listing failed, expected %d, found %d\n",10*(i+1),itr->beg+1); - str.l = 0; - ksprintf(&str,"%d",itr->beg+1); - if ( strcmp(regitr_payload(itr,char*),str.s) ) error("listing failed, expected payload \"%s\", found \"%s\"\n",str.s,regitr_payload(itr,char*)); - i++; - } - if ( i!=n ) error("Expected %d regions, listed %d\n", n,i); - debug("ok: listed %d regions\n", n); - - // Clean up - regitr_destroy(itr); - regidx_destroy(idx); - free(str.s); -} - -void test_custom_payload(void) -{ - // Init index with no file name, we will insert the regions manually - regidx_t *idx = regidx_init(NULL,custom_parse,custom_free,sizeof(char*),NULL); - if ( !idx ) error("init failed\n"); - - // Insert regions - char *line; - line = "1 10000000 10000000 1:10000000-10000000"; if ( regidx_insert(idx,line)!=0 ) error("insert failed: %s\n", line); - line = "1 20000000 20000001 1:20000000-20000001"; if ( regidx_insert(idx,line)!=0 ) error("insert failed: %s\n", line); - line = "1 20000002 20000002 1:20000002-20000002"; if ( regidx_insert(idx,line)!=0 ) error("insert failed: %s\n", line); - line = "1 30000000 30000000 1:30000000-30000000"; if ( regidx_insert(idx,line)!=0 ) error("insert failed: %s\n", line); - - // Test - regitr_t *itr = regitr_init(idx); - int from, to; - - from = to = 10000000; - if ( !regidx_overlap(idx,"1",from-1,to-1,itr) ) error("query failed: 1:%d-%d\n",from,to); - if ( strcmp("1:10000000-10000000",regitr_payload(itr,char*)) ) error("query failed: 1:%d-%d vs %s\n", from,to,regitr_payload(itr,char*)); - if ( !regidx_overlap(idx,"1",from-2,to-1,itr) ) error("query failed: 1:%d-%d\n",from-1,to); - if ( !regidx_overlap(idx,"1",from-2,to+3,itr) ) error("query failed: 1:%d-%d\n",from-1,to+2); - if ( regidx_overlap(idx,"1",from-2,to-2,itr) ) error("query failed: 1:%d-%d\n",from-1,to-1); - - from = to = 20000000; - if ( !regidx_overlap(idx,"1",from-1,to-1,itr) ) error("query failed: 1:%d-%d\n",from,to); - - from = to = 20000002; - if ( !regidx_overlap(idx,"1",from-1,to-1,itr) ) error("query failed: 1:%d-%d\n",from,to); - - from = to = 30000000; - if ( !regidx_overlap(idx,"1",from-1,to-1,itr) ) error("query failed: 1:%d-%d\n",from,to); - - // Clean up - regitr_destroy(itr); - regidx_destroy(idx); -} - -void get_random_region(uint32_t min, uint32_t max, uint32_t *beg, uint32_t *end) -{ - long int b = random(), e = random(); - *beg = min + (float)b * (max-min) / RAND_MAX; - *end = *beg + (float)e * (max-*beg) / RAND_MAX; -} - -void test_random(int nregs, uint32_t min, uint32_t max) -{ - min--; - max--; - - // Init index with no file name, we will insert the regions manually - regidx_t *idx = regidx_init(NULL,custom_parse,custom_free,sizeof(char*),NULL); - if ( !idx ) error("init failed\n"); - - // Test region - uint32_t beg,end; - get_random_region(min,max,&beg,&end); - - // Insert regions - int i, nexp = 0; - kstring_t str = {0,0,0}; - for (i=0; i=beg && b<=end ) nexp++; - } - - // Test - regitr_t *itr = regitr_init(idx); - int nhit = 0, ret = regidx_overlap(idx,"1",beg,end,itr); - if ( nexp && !ret ) error("query failed, expected %d overlap(s), found none: %d-%d\n", nexp,beg+1,end+1); - if ( !nexp && ret ) error("query failed, expected no overlaps, found some: %d-%d\n", beg+1,end+1); - while ( ret && regitr_overlap(itr) ) - { - str.l = 0; - ksprintf(&str,"1:%"PRIu32"-%"PRIu32"",itr->beg+1,itr->end+1); - if ( strcmp(str.s,regitr_payload(itr,char*)) ) - error("query failed, incorrect payload: %s vs %s (%d-%d)\n",str.s,regitr_payload(itr,char*),beg+1,end+1); - if ( itr->beg > end || itr->end < beg ) - error("query failed, incorrect hit: %d-%d vs %d-%d, payload %s\n", beg+1,end+1,itr->beg+1,itr->end+1,regitr_payload(itr,char*)); - nhit++; - } - if ( nexp!=nhit ) error("query failed, expected %d overlap(s), found %d: %d-%d\n",nexp,nhit,beg+1,end+1); - debug("ok: found %d overlaps\n", nexp); - - // Clean up - regitr_destroy(itr); - regidx_destroy(idx); - free(str.s); -} - -void create_line_bed(char *line, char *chr, int start, int end) -{ - sprintf(line,"%s\t%d\t%d\n",chr,start-1,end); -} -void create_line_tab(char *line, char *chr, int start, int end) -{ - sprintf(line,"%s\t%d\t%d\n",chr,start,end); -} -void create_line_reg(char *line, char *chr, int start, int end) -{ - sprintf(line,"%s:%d-%d\n",chr,start,end); -} - -typedef void (*set_line_f)(char *line, char *chr, int start, int end); - -void test(set_line_f set_line, regidx_parse_f parse) -{ - regidx_t *idx = regidx_init(NULL,parse,NULL,0,NULL); - if ( !idx ) error("init failed\n"); - - char line[250], *chr = "1"; - int i, n = 10, start, end, nhit; - for (i=1; ibeg > end-1 || itr->end < start-1 ) error("query failed, incorrect region: %d-%d for %d-%d\n",itr->beg+1,itr->end+1,start,end); - debug("\t %d-%d\n",itr->beg+1,itr->end+1); - nhit++; - } - if ( nhit!=1 ) error("query failed, expected one hit, found %d: %s:%d-%d\n",nhit,chr,start,end); - - - // one hit - start = end = 10*i+1; - if ( !regidx_overlap(idx,chr,start-1,end-1,itr) ) error("query failed, there should be a hit: %s:%d-%d\n",chr,start,end); - debug("ok: overlap(s) found for %s:%d-%d\n",chr,start,end); - nhit = 0; - while ( regitr_overlap(itr) ) - { - if ( itr->beg > end-1 || itr->end < start-1 ) error("query failed, incorrect region: %d-%d for %d-%d\n",itr->beg+1,itr->end+1,start,end); - debug("\t %d-%d\n",itr->beg+1,itr->end+1); - nhit++; - } - if ( nhit!=1 ) error("query failed, expected one hit, found %d: %s:%d-%d\n",nhit,chr,start,end); - - - // two hits - start = 10*i; end = start+1; - if ( !regidx_overlap(idx,chr,start-1,end-1,itr) ) error("query failed, there should be a hit: %s:%d-%d\n",chr,start,end); - debug("ok: overlap(s) found for %s:%d-%d\n",chr,start,end); - nhit = 0; - while ( regitr_overlap(itr) ) - { - if ( itr->beg > end-1 || itr->end < start-1 ) error("query failed, incorrect region: %d-%d for %d-%d\n",itr->beg+1,itr->end+1,start,end); - debug("\t %d-%d\n",itr->beg+1,itr->end+1); - nhit++; - } - if ( nhit!=2 ) error("query failed, expected two hits, found %d: %s:%d-%d\n",nhit,chr,start,end); - - // fully contained interval, one hit - start = 20000*i - 5000; end = 20000*i + 3000; - set_line(line,chr,start,end); - if ( !regidx_overlap(idx,chr,start-1,end-1,itr) ) error("query failed, there should be a hit: %s:%d-%d\n",chr,start,end); - debug("ok: overlap(s) found for %s:%d-%d\n",chr,start,end); - nhit = 0; - while ( regitr_overlap(itr) ) - { - if ( itr->beg > end-1 || itr->end < start-1 ) error("query failed, incorrect region: %d-%d for %d-%d\n",itr->beg+1,itr->end+1,start,end); - debug("\t %d-%d\n",itr->beg+1,itr->end+1); - nhit++; - } - if ( nhit!=1 ) error("query failed, expected one hit, found %d: %s:%d-%d\n",nhit,chr,start,end); - } - regitr_destroy(itr); - regidx_destroy(idx); -} - -static void usage(void) -{ - fprintf(stderr, "Usage: test-regidx [OPTIONS]\n"); - fprintf(stderr, "Options:\n"); - fprintf(stderr, " -h, --help this help message\n"); - fprintf(stderr, " -s, --seed random seed\n"); - fprintf(stderr, " -v, --verbose increase verbosity by giving multiple times\n"); - - exit(1); -} - -int main(int argc, char **argv) -{ - static struct option loptions[] = - { - {"help",0,0,'h'}, - {"verbose",0,0,'v'}, - {"seed",1,0,'s'}, - {0,0,0,0} - }; - int c; - int seed = (int)time(NULL); - while ((c = getopt_long(argc, argv, "hvs:",loptions,NULL)) >= 0) - { - switch (c) - { - case 's': seed = atoi(optarg); break; - case 'v': verbose++; break; - default: usage(); break; - } - } - - info("Testing sequential access\n"); - test_sequential_access(); - - info("Testing TAB\n"); - test(create_line_tab,regidx_parse_tab); - - info("Testing REG\n"); - test(create_line_reg,regidx_parse_reg); - - info("Testing BED\n"); - test(create_line_bed,regidx_parse_bed); - - info("Testing custom payload\n"); - test_custom_payload(); - - int i, ntest = 1000, nreg = 50; - srandom(seed); - info("%d randomized tests, %d regions per test. Random seed is %d\n", ntest,nreg,seed); - for (i=0; i #include #include "bcftools.h" -#include "regidx.h" +#include #include "vcmp.h" #define DBG 0