From 888c0240f6f95156b66e307e9fb777298e5be247 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Fri, 30 Sep 2022 16:13:28 +0200 Subject: [PATCH 1/6] Backup commit only, nothing is working, long way to go --- Makefile | 7 +- main.c | 5 + mpileup.c | 2 +- mpileup2.c | 622 +++++++++++++++++++++++++++++++++++++++++++++ mpileup2/mpileup.c | 47 ++++ mpileup2/mpileup.h | 69 +++++ 6 files changed, 749 insertions(+), 3 deletions(-) create mode 100644 mpileup2.c create mode 100644 mpileup2/mpileup.c create mode 100644 mpileup2/mpileup.h diff --git a/Makefile b/Makefile index f318386f1..4811b4851 100644 --- a/Makefile +++ b/Makefile @@ -40,9 +40,10 @@ OBJS = main.o vcfindex.o tabix.o \ vcfcall.o mcall.o vcmp.o gvcf.o reheader.o convert.o vcfconvert.o tsv2vcf.o \ vcfcnv.o vcfhead.o HMM.o consensus.o ploidy.o bin.o hclust.o version.o \ regidx.o smpl_ilist.o csq.o vcfbuf.o \ - mpileup.o bam2bcf.o bam2bcf_indel.o bam_sample.o \ + mpileup.o mpileup2.o bam2bcf.o bam2bcf_indel.o bam_sample.o \ vcfsort.o cols.o extsort.o dist.o abuf.o \ - ccall.o em.o prob1.o kmin.o str_finder.o + ccall.o em.o prob1.o kmin.o str_finder.o \ + mpileup2/mpileup.o PLUGIN_OBJS = vcfplugin.o prefix = /usr/local @@ -277,6 +278,8 @@ cols.o: cols.c cols.h regidx.o: regidx.c $(htslib_hts_h) $(htslib_kstring_h) $(htslib_kseq_h) $(htslib_khash_str2int_h) regidx.h consensus.o: consensus.c $(htslib_vcf_h) $(htslib_kstring_h) $(htslib_synced_bcf_reader_h) $(htslib_kseq_h) $(htslib_bgzf_h) regidx.h $(bcftools_h) rbuf.h $(filter_h) mpileup.o: mpileup.c $(htslib_sam_h) $(htslib_faidx_h) $(htslib_kstring_h) $(htslib_khash_str2int_h) $(htslib_hts_os_h) regidx.h $(bcftools_h) $(bam2bcf_h) $(bam_sample_h) $(gvcf_h) +mpileup2.o: mpileup2.c $(htslib_sam_h) $(htslib_faidx_h) $(htslib_kstring_h) $(htslib_khash_str2int_h) $(htslib_hts_os_h) $(bcftools_h) mpileup2/mpileup.h +mpileup2/mpileup.o: mpileup2/mpileup.h mpileup2/mpileup.c bam2bcf.o: bam2bcf.c $(htslib_hts_h) $(htslib_sam_h) $(htslib_kstring_h) $(htslib_kfunc_h) $(bam2bcf_h) mw.h bam2bcf_indel.o: bam2bcf_indel.c $(htslib_hts_h) $(htslib_sam_h) $(htslib_khash_str2int_h) $(bam2bcf_h) $(htslib_ksort_h) str_finder.h bam_sample.o: bam_sample.c $(htslib_hts_h) $(htslib_kstring_h) $(htslib_khash_str2int_h) $(khash_str2str_h) $(bam_sample_h) $(bcftools_h) diff --git a/main.c b/main.c index 3a0d557b7..a7c393dc3 100644 --- a/main.c +++ b/main.c @@ -61,6 +61,7 @@ int count_plugins(void); int main_consensus(int argc, char *argv[]); int main_csq(int argc, char *argv[]); int main_mpileup(int argc, char *argv[]); +int main_mpileup2(int argc, char *argv[]); int main_sort(int argc, char *argv[]); typedef struct @@ -174,6 +175,10 @@ static cmd_t cmds[] = .alias = "mpileup", .help = "multi-way pileup producing genotype likelihoods" }, + { .func = main_mpileup2, + .alias = "mpileup2", + .help = "-new version of mpileup (experimental)" // do not advertise yet + }, #if USE_GPL { .func = main_polysomy, .alias = "polysomy", diff --git a/mpileup.c b/mpileup.c index fc4f4b130..8c29a3087 100644 --- a/mpileup.c +++ b/mpileup.c @@ -1045,7 +1045,7 @@ int read_file_list(const char *file_list,int *n,char **argv[]) } #undef MAX_PATH_LEN -int parse_format_flag(const char *str) +static int parse_format_flag(const char *str) { int i, flag = 0, n_tags; char **tags = hts_readlist(str, 0, &n_tags); diff --git a/mpileup2.c b/mpileup2.c new file mode 100644 index 000000000..a2485345e --- /dev/null +++ b/mpileup2.c @@ -0,0 +1,622 @@ +/* mpileup2.c -- mpileup2, new version of mpileup + + Copyright (C) 2022 Genome Research Ltd. + + Author: Petr Danecek + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + + +/* + +Can't decide how much of the functionality should stay in bcftools and what should be moved to htslib and +controlled via mpileup_set_option() + - list BAM files so that cumbersome nplp,plp,nsmpl,files,nfiles variables can be avoided on user side; + check the BAM header consistency (this is currently not done by mpileup) + - list of read groups to include or exclude, possibly with sample renaming, as in mpileup -G + - list of samples to include or exclude, as in mpileup -S + - targets and regions, as in mpileup -R/-T/-r/-t + +*/ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "bcftools.h" +#include "mpileup2/mpileup.h" +#include "bam2bcf.h" +#include "gvcf.h" + +#define MPLP_BCF 1 +#define MPLP_VCF (1<<1) +#define MPLP_NO_COMP (1<<2) +#define MPLP_NO_ORPHAN (1<<3) +#define MPLP_REALN (1<<4) +#define MPLP_NO_INDEL (1<<5) +#define MPLP_REDO_BAQ (1<<6) +#define MPLP_ILLUMINA13 (1<<7) +#define MPLP_IGNORE_RG (1<<8) +#define MPLP_PRINT_POS (1<<9) +#define MPLP_PRINT_MAPQ (1<<10) +#define MPLP_PER_SAMPLE (1<<11) +#define MPLP_SMART_OVERLAPS (1<<12) +#define MPLP_REALN_PARTIAL (1<<13) + +typedef struct +{ + int argc; + char **argv, *output_fname; + int rflag_skip_any_unset, rflag_skip_all_unset, rflag_skip_any_set, rflag_skip_all_set, flag, fmt_flag; + int output_type, clevel, max_read_len; + int record_cmd_line; + mpileup_t *mplp; + bcf_callaux_t bca; + gvcf_t *gvcf; +} +args_t; + + +// Data shared by all bam files +typedef struct { + int min_mq, flag, min_baseQ, max_baseQ, delta_baseQ, capQ_thres, max_depth, + max_indel_depth, max_read_len, fmt_flag, ambig_reads; + int rflag_skip_any_unset, rflag_skip_all_unset, rflag_skip_any_set, rflag_skip_all_set, output_type; + int openQ, extQ, tandemQ, min_support, indel_win_size; // for indels + double min_frac; // for indels + double indel_bias; + char *reg_fname, *pl_list, *fai_fname, *output_fname; + int reg_is_file, record_cmd_line, n_threads, clevel; + int bed_logic; // 1: include region, 0: exclude region + + // auxiliary structures for calling + kstring_t buf; + bcf1_t *bcf_rec; + htsFile *bcf_fp; + bcf_hdr_t *bcf_hdr; + int argc; + char **argv; +} mplp_conf_t; + +static int parse_format_flag(const char *str) +{ + int i, flag = 0, n_tags; + char **tags = hts_readlist(str, 0, &n_tags); + for(i=0; irflag_skip_all_set); + char *tmp_skip_any_unset = bam_flag2str(mplp->rflag_skip_any_unset); + char *tmp_skip_all_unset = bam_flag2str(mplp->rflag_skip_all_unset); + char *tmp_skip_any_set = bam_flag2str(mplp->rflag_skip_any_set); + + fprintf(fp, + "\n" + "Usage: bcftools mpileup2 [OPTIONS] in1.bam [in2.bam [...]]\n" + "\n" + "Input options:\n" + " -6, --illumina1.3+ Quality is in the Illumina-1.3+ encoding\n" + " -A, --count-orphans Do not discard anomalous read pairs\n" + " -b, --bam-list FILE List of input BAM filenames, one per line\n" + " -B, --no-BAQ Disable BAQ (per-Base Alignment Quality)\n" + " -C, --adjust-MQ INT Adjust mapping quality [0]\n" + " -D, --full-BAQ Apply BAQ everywhere, not just in problematic regions\n" + " -d, --max-depth INT Max raw per-file depth; avoids excessive memory usage [%d]\n", mplp->max_depth); + fprintf(fp, + " -E, --redo-BAQ Recalculate BAQ on the fly, ignore existing BQs\n" + " -f, --fasta-ref FILE Faidx indexed reference sequence file\n" + " --no-reference Do not require fasta reference file\n" + " -G, --read-groups FILE Select or exclude read groups listed in the file\n" + " -q, --min-MQ INT Skip alignments with mapQ smaller than INT [%d]\n", mplp->min_mq); + fprintf(fp, + " -Q, --min-BQ INT Skip bases with baseQ/BAQ smaller than INT [%d]\n", mplp->min_baseQ); + fprintf(fp, + " --max-BQ INT Limit baseQ/BAQ to no more than INT [%d]\n", mplp->max_baseQ); + fprintf(fp, + " --delta-BQ INT Use neighbour_qual + INT if less than qual [%d]\n", mplp->delta_baseQ); + fprintf(fp, + " -r, --regions REG[,...] Comma separated list of regions in which pileup is generated\n" + " -R, --regions-file FILE Restrict to regions listed in a file\n" + " --ignore-RG Ignore RG tags (one BAM = one sample)\n" + " --ls, --skip-all-set STR|INT Skip reads with all of the bits set []\n"); + fprintf(fp, + " --ns, --skip-any-set STR|INT Skip reads with any of the bits set [%s]\n", tmp_skip_any_set); + fprintf(fp, + " --lu, --skip-all-unset STR|INT Skip reads with all of the bits unset []\n" + " --nu, --skip-any-unset STR|INT Skip reads with any of the bits unset []\n"); + fprintf(fp, + " -s, --samples LIST Comma separated list of samples to include\n" + " -S, --samples-file FILE File of samples to include\n" + " -t, --targets REG[,...] Similar to -r but streams rather than index-jumps\n" + " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n" + " -x, --ignore-overlaps Disable read-pair overlap detection\n" + " --seed INT Random number seed used for sampling deep regions [0]\n" + "\n" + "Output options:\n" + " -a, --annotate LIST Optional tags to output; '\\?' to list available tags []\n" + " -g, --gvcf INT[,...] Group non-variant sites into gVCF blocks according\n" + " To minimum per-sample DP\n" + " --no-version Do not append version and command line to the header\n" + " -o, --output FILE Write output to FILE [standard output]\n" + " -O, --output-type TYPE 'b' compressed BCF; 'u' uncompressed BCF;\n" + " 'z' compressed VCF; 'v' uncompressed VCF; 0-9 compression level [v]\n" + " -U, --mwu-u Use older probability scale for Mann-Whitney U test\n" + " --threads INT Use multithreading with INT worker threads [0]\n" + "\n" + "SNP/INDEL genotype likelihoods options:\n" + " -X, --config STR Specify platform specific profiles (see below)\n" + " -e, --ext-prob INT Phred-scaled gap extension seq error probability [%d]\n", mplp->extQ); + fprintf(fp, + " -F, --gap-frac FLOAT Minimum fraction of gapped reads [%g]\n", mplp->min_frac); + fprintf(fp, + " -h, --tandem-qual INT Coefficient for homopolymer errors [%d]\n", mplp->tandemQ); + fprintf(fp, + " -I, --skip-indels Do not perform indel calling\n" + " -L, --max-idepth INT Maximum per-file depth for INDEL calling [%d]\n", mplp->max_indel_depth); + fprintf(fp, + " -m, --min-ireads INT Minimum number gapped reads for indel candidates [%d]\n", mplp->min_support); + fprintf(fp, + " -M, --max-read-len INT Maximum length of read to pass to BAQ algorithm [%d]\n", mplp->max_read_len); + fprintf(fp, + " -o, --open-prob INT Phred-scaled gap open seq error probability [%d]\n", mplp->openQ); + fprintf(fp, + " -p, --per-sample-mF Apply -m and -F per-sample for increased sensitivity\n" + " -P, --platforms STR Comma separated list of platforms for indels [all]\n" + " --ar, --ambig-reads STR What to do with ambiguous indel reads: drop,incAD,incAD0 [drop]\n"); + fprintf(fp, + " --indel-bias FLOAT Raise to favour recall over precision [%.2f]\n", mplp->indel_bias); + fprintf(fp, + " --indel-size INT Approximate maximum indel size considered [%d]\n", mplp->indel_win_size); + fprintf(fp,"\n"); + fprintf(fp, + "Configuration profiles activated with -X, --config:\n" + " 1.12: -Q13 -h100 -m1 -F0.002\n" + " illumina: [ default values ]\n" + " ont: -B -Q5 --max-BQ 30 -I [also try eg |bcftools call -P0.01]\n" + " pacbio-ccs: -D -Q5 --max-BQ 50 -F0.1 -o25 -e1 --delta-BQ 10 -M99999\n" + "\n" + "Notes: Assuming diploid individuals.\n" + "\n" + "Example:\n" + " # See also http://samtools.github.io/bcftools/howtos/variant-calling.html\n" + " bcftools mpileup -Ou -f reference.fa alignments.bam | bcftools call -mv -Ob -o calls.bcf\n" + "\n"); + + free(tmp_skip_all_set); + free(tmp_skip_any_unset); + free(tmp_skip_all_unset); + free(tmp_skip_any_set); +} + +int main_mpileup2(int argc, char *argv[]) +{ + int c, ret = 0; + args_t *args = (args_t*) calloc(1,sizeof(args_t)); + args->argc = argc; args->argv = argv; + args->mplp = mpileup_init(); + args->rflag_skip_any_set = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP; + args->flag = MPLP_NO_ORPHAN | MPLP_REALN | MPLP_REALN_PARTIAL | MPLP_SMART_OVERLAPS; + args->bca.openQ = 40; + + int noref = 0; + +#if 0 + const char *file_list = NULL; + char **fn = NULL; + int nfiles = 0, use_orphan = 0, noref = 0; + mplp_conf_t mplp; + memset(&mplp, 0, sizeof(mplp_conf_t)); + mplp.min_baseQ = 1; + mplp.max_baseQ = 60; + mplp.delta_baseQ = 30; + mplp.capQ_thres = 0; + mplp.max_depth = 250; mplp.max_indel_depth = 250; + mplp.openQ = 40; mplp.extQ = 20; mplp.tandemQ = 500; + mplp.min_frac = 0.05; mplp.indel_bias = 1.0; mplp.min_support = 2; + mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN | MPLP_REALN_PARTIAL + | MPLP_SMART_OVERLAPS; + mplp.argc = argc; mplp.argv = argv; + mplp.rflag_skip_any_set = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP; + mplp.output_fname = NULL; + mplp.output_type = FT_VCF; + mplp.record_cmd_line = 1; + mplp.n_threads = 0; + // the default to be changed in future, see also parse_format_flag() + //mplp.fmt_flag = B2B_INFO_VDB|B2B_INFO_RPB|B2B_INFO_SCB|B2B_INFO_ZSCORE; + //mplp.ambig_reads = B2B_DROP; + mplp.max_read_len = 500; + mplp.indel_win_size = 110; + mplp.clevel = -1; + hts_srand48(0); +#endif + + static const struct option lopts[] = + { + {"nu", required_argument, NULL, 16}, + {"lu", required_argument, NULL, 17}, + {"rf", required_argument, NULL, 17}, // old --rf, --incl-flags = --lu, --skip-all-unset + {"ns", required_argument, NULL, 18}, + {"ff", required_argument, NULL, 18}, // old --ff, --excl-flags = --ns, --skip-any-set + {"ls", required_argument, NULL, 19}, + {"skip-any-unset", required_argument, NULL, 16}, + {"skip-all-unset", required_argument, NULL, 17}, + {"skip-any-set", required_argument, NULL, 18}, + {"skip-all-set", required_argument, NULL, 19}, + {"output", required_argument, NULL, 3}, + {"open-prob", required_argument, NULL, 4}, + {"ignore-RG", no_argument, NULL, 5}, + {"ignore-rg", no_argument, NULL, 5}, + {"gvcf", required_argument, NULL, 'g'}, + {"no-reference", no_argument, NULL, 7}, + {"no-version", no_argument, NULL, 8}, + {"threads",required_argument,NULL,9}, + {"illumina1.3+", no_argument, NULL, '6'}, + {"count-orphans", no_argument, NULL, 'A'}, + {"bam-list", required_argument, NULL, 'b'}, + {"no-BAQ", no_argument, NULL, 'B'}, + {"no-baq", no_argument, NULL, 'B'}, + {"full-BAQ", no_argument, NULL, 'D'}, + {"full-baq", no_argument, NULL, 'D'}, + {"adjust-MQ", required_argument, NULL, 'C'}, + {"adjust-mq", required_argument, NULL, 'C'}, + {"max-depth", required_argument, NULL, 'd'}, + {"redo-BAQ", no_argument, NULL, 'E'}, + {"redo-baq", no_argument, NULL, 'E'}, + {"fasta-ref", required_argument, NULL, 'f'}, + {"read-groups", required_argument, NULL, 'G'}, + {"region", required_argument, NULL, 'r'}, + {"regions", required_argument, NULL, 'r'}, + {"regions-file", required_argument, NULL, 'R'}, + {"targets", required_argument, NULL, 't'}, + {"targets-file", required_argument, NULL, 'T'}, + {"min-MQ", required_argument, NULL, 'q'}, + {"min-mq", required_argument, NULL, 'q'}, + {"min-BQ", required_argument, NULL, 'Q'}, + {"min-bq", required_argument, NULL, 'Q'}, + {"max-bq", required_argument, NULL, 11}, + {"max-BQ", required_argument, NULL, 11}, + {"delta-BQ", required_argument, NULL, 12}, + {"ignore-overlaps", no_argument, NULL, 'x'}, + {"output-type", required_argument, NULL, 'O'}, + {"samples", required_argument, NULL, 's'}, + {"samples-file", required_argument, NULL, 'S'}, + {"annotate", required_argument, NULL, 'a'}, + {"ext-prob", required_argument, NULL, 'e'}, + {"gap-frac", required_argument, NULL, 'F'}, + {"indel-bias", required_argument, NULL, 10}, + {"indel-size", required_argument, NULL, 15}, + {"tandem-qual", required_argument, NULL, 'h'}, + {"skip-indels", no_argument, NULL, 'I'}, + {"max-idepth", required_argument, NULL, 'L'}, + {"min-ireads", required_argument, NULL, 'm'}, + {"per-sample-mF", no_argument, NULL, 'p'}, + {"per-sample-mf", no_argument, NULL, 'p'}, + {"platforms", required_argument, NULL, 'P'}, + {"max-read-len", required_argument, NULL, 'M'}, + {"config", required_argument, NULL, 'X'}, + {"mwu-u", no_argument, NULL, 'U'}, + {"seed", required_argument, NULL, 13}, + {"ambig-reads", required_argument, NULL, 14}, + {"ar", required_argument, NULL, 14}, + {NULL, 0, NULL, 0} + }; + while ((c = getopt_long(argc, argv, "Ag:f:r:R:q:Q:C:BDd:L:b:P:po:e:h:Im:F:EG:6O:xa:s:S:t:T:M:X:U",lopts,NULL)) >= 0) { + switch (c) { + case 'x': mpileup_set_opt(args->mplp, int, SMART_OVERLAPS, 0); break; + case 16 : + args->rflag_skip_any_unset = bam_str2flag(optarg); + if ( args->rflag_skip_any_unset <0 ) { fprintf(stderr,"Could not parse --nf %s\n", optarg); return 1; } + break; + case 17 : + args->rflag_skip_all_unset = bam_str2flag(optarg); + if ( args->rflag_skip_all_unset<0 ) { fprintf(stderr,"Could not parse --if %s\n", optarg); return 1; } + break; + case 18 : + args->rflag_skip_any_set = bam_str2flag(optarg); + if ( args->rflag_skip_any_set <0 ) { fprintf(stderr,"Could not parse --ef %s\n", optarg); return 1; } + break; + case 19 : + args->rflag_skip_all_set = bam_str2flag(optarg); + if ( args->rflag_skip_all_set <0 ) { fprintf(stderr,"Could not parse --df %s\n", optarg); return 1; } + break; + case 3 : args->output_fname = optarg; break; + case 4 : args->bca.openQ = atoi(optarg); break; + case 5 : mpileup_set_opt(args->mplp, int, SMPL_IGNORE_RG, 1); break; + case 'g': + args->gvcf = gvcf_init(optarg); + if ( !args->gvcf ) error("Could not parse: --gvcf %s\n", optarg); + break; + case 'f': mpileup_set_opt(args->mplp, char*, FASTA_REF, optarg); break; + case 7 : noref = 1; break; + case 8 : args->record_cmd_line = 0; break; + case 9 : error("todo: --threads\n"); /* note this was used *only* for output VCF compression, practically useless, moreover -Ou is recommended... */ break; + case 'd': mpileup_set_opt(args->mplp, int, MAX_DP_PER_FILE, atoi(optarg)); break; + case 'r': mpileup_set_opt(args->mplp, char*, REGIONS, optarg); break; + case 'R': mpileup_set_opt(args->mplp, char*, REGIONS_FNAME, optarg); break; + case 't': mpileup_set_opt(args->mplp, char*, TARGETS, optarg); + // // In the original version the whole BAM was streamed which is inefficient + // // with few BED intervals and big BAMs. Todo: devise a heuristic to determine + // // best strategy, that is streaming or jumping. + // if ( optarg[0]=='^' ) optarg++; + // else mplp.bed_logic = 1; + // mplp.bed = regidx_init(NULL,regidx_parse_reg,NULL,0,NULL); + // mplp.bed_itr = regitr_init(mplp.bed); + // if ( regidx_insert_list(mplp.bed,optarg,',') !=0 ) + // { + // fprintf(stderr,"Could not parse the targets: %s\n", optarg); + // exit(EXIT_FAILURE); + // } + break; + case 'T': mpileup_set_opt(args->mplp, char*, TARGETS_FNAME, optarg); + // if ( optarg[0]=='^' ) optarg++; + // else mplp.bed_logic = 1; + // mplp.bed = regidx_init(optarg,NULL,NULL,0,NULL); + // if (!mplp.bed) { fprintf(stderr, "bcftools mpileup: Could not read file \"%s\"", optarg); return 1; } + break; + case 'P': error("todo: --platforms\n"); /* this was never used: mplp.pl_list = strdup(optarg); */ break; + case 'p': args->flag |= MPLP_PER_SAMPLE; break; + case 'B': args->flag &= ~MPLP_REALN; break; + case 'D': args->flag &= ~MPLP_REALN_PARTIAL; break; + case 'I': args->flag |= MPLP_NO_INDEL; break; + case 'E': args->flag |= MPLP_REDO_BAQ; break; + case '6': args->flag |= MPLP_ILLUMINA13; break; + case 's': mpileup_set_opt(args->mplp, char*, SAMPLES, optarg); break; + case 'S': mpileup_set_opt(args->mplp, char*, SAMPLES_FNAME, optarg); break; + case 'O': + switch (optarg[0]) { + case 'b': args->output_type = FT_BCF_GZ; break; + case 'u': args->output_type = FT_BCF; break; + case 'z': args->output_type = FT_VCF_GZ; break; + case 'v': args->output_type = FT_VCF; break; + default: + { + char *tmp; + args->clevel = strtol(optarg,&tmp,10); + if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg); + } + } + if ( optarg[1] ) + { + char *tmp; + args->clevel = strtol(optarg+1,&tmp,10); + if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --output-type %s\n", optarg+1); + } + break; + case 'C': mpileup_set_opt(args->mplp, int, ADJUST_MQ, atoi(optarg)); break; + case 'q': mpileup_set_opt(args->mplp, int, MIN_MQ, atoi(optarg)); break; + case 'Q': mpileup_set_opt(args->mplp, int, MIN_BQ, atoi(optarg)); break; + case 11: mpileup_set_opt(args->mplp, int, MAX_BQ, atoi(optarg)); break; + case 12: mpileup_set_opt(args->mplp, int, DELTA_BQ, atoi(optarg)); break; + case 'b': mpileup_set_opt(args->mplp, char*, BAM_FNAME, optarg); break; + case 'o': + /* this option used to be -o INT and -o FILE. In the new code disable -o as an alias of --open-prob as its use is very rare */ + args->output_fname = optarg; + break; + case 'e': args->bca.extQ = atoi(optarg); break; + case 'h': args->bca.tandemQ = atoi(optarg); break; + case 10: // --indel-bias (inverted so higher => more indels called) + if (atof(optarg) < 1e-2) + args->bca.indel_bias = 1/1e2; + else + args->bca.indel_bias = 1/atof(optarg); + break; + case 15: { + char *tmp; + args->bca.indel_win_size = strtol(optarg,&tmp,10); + if ( *tmp ) error("Could not parse argument: --indel-size %s\n", optarg); + if ( args->bca.indel_win_size < 110 ) + { + args->bca.indel_win_size = 110; + fprintf(stderr,"Warning: running with --indel-size %d, the requested value is too small\n",args->bca.indel_win_size); + } + } + break; + case 'A': args->flag &= ~MPLP_NO_ORPHAN; break; + case 'F': args->bca.min_frac = atof(optarg); break; + case 'm': args->bca.min_support = atoi(optarg); break; + case 'L': error("todo: -L\n"); /* mplp.max_indel_depth = atoi(optarg); */ break; + case 'G': mpileup_set_opt(args->mplp, char*, READ_GROUPS_FNAME, optarg); break; + case 'a': + if (optarg[0]=='?') { + list_annotations(stderr); + return 1; + } + args->bca.fmt_flag |= parse_format_flag(optarg); + break; + case 'M': args->max_read_len = atoi(optarg); break; + case 'U': args->fmt_flag &= ~B2B_INFO_ZSCORE; break; + case 'X': + if (strcasecmp(optarg, "pacbio-ccs") == 0) { + args->bca.min_frac = 0.1; + args->bca.min_baseQ = 5; + args->bca.max_baseQ = 50; + args->bca.delta_baseQ = 10; + args->bca.openQ = 25; + args->bca.extQ = 1; + args->flag |= MPLP_REALN_PARTIAL; + args->max_read_len = 99999; + } else if (strcasecmp(optarg, "ont") == 0) { + fprintf(stderr, "For ONT it may be beneficial to also run bcftools call with " + "a higher -P, eg -P0.01 or -P 0.1\n"); + args->bca.min_baseQ = 5; + args->bca.max_baseQ = 30; + args->flag &= ~MPLP_REALN; + args->flag |= MPLP_NO_INDEL; + } else if (strcasecmp(optarg, "1.12") == 0) { + // 1.12 and earlier + args->bca.min_frac = 0.002; + args->bca.min_support = 1; + args->bca.min_baseQ = 13; + args->bca.tandemQ = 100; + args->flag &= ~MPLP_REALN_PARTIAL; + args->flag |= MPLP_REALN; + } else if (strcasecmp(optarg, "illumina") == 0) { + args->flag |= MPLP_REALN_PARTIAL; + } else { + fprintf(stderr, "Unknown configuration name '%s'\n" + "Please choose from 1.12, illumina, pacbio-ccs or ont\n", + optarg); + return 1; + } + break; + case 13: hts_srand48(atoi(optarg)); break; + case 14: + error("todo: --ambig-reads\n"); + // if ( !strcasecmp(optarg,"drop") ) .ambig_reads = B2B_DROP; + // else if ( !strcasecmp(optarg,"incAD") ) mplp.ambig_reads = B2B_INC_AD; + // else if ( !strcasecmp(optarg,"incAD0") ) mplp.ambig_reads = B2B_INC_AD0; + // else error("The option to --ambig-reads not recognised: %s\n",optarg); + break; + default: + fprintf(stderr,"Invalid option: '%c'\n", c); + return 1; + } + } + +#if 0 + if ( mplp.gvcf && !(mplp.fmt_flag&B2B_FMT_DP) ) + { + fprintf(stderr,"[warning] The -a DP option is required with --gvcf, switching on.\n"); + mplp.fmt_flag |= B2B_FMT_DP; + } + if ( mplp.flag&(MPLP_BCF|MPLP_VCF|MPLP_NO_COMP) ) + { + if ( mplp.flag&MPLP_VCF ) + { + if ( mplp.flag&MPLP_NO_COMP ) mplp.output_type = FT_VCF; + else mplp.output_type = FT_VCF_GZ; + } + else if ( mplp.flag&MPLP_BCF ) + { + if ( mplp.flag&MPLP_NO_COMP ) mplp.output_type = FT_BCF; + else mplp.output_type = FT_BCF_GZ; + } + } + if ( !(mplp.flag&MPLP_REALN) && mplp.flag&MPLP_REDO_BAQ ) + { + fprintf(stderr,"Error: The -B option cannot be combined with -E\n"); + return 1; + } + if (argc == 1) + { + print_usage(stderr, &mplp); + return 1; + } + if (!mplp.fai && !noref) { + fprintf(stderr,"Error: mpileup requires the --fasta-ref option by default; use --no-reference to run without a fasta reference\n"); + return 1; + } + int ret,i; + if (file_list) + { + if ( read_file_list(file_list,&nfiles,&fn) ) return 1; + mplp.files = fn; + mplp.nfiles = nfiles; + } + else + { + mplp.nfiles = argc - optind; + mplp.files = (char**) malloc(mplp.nfiles*sizeof(char*)); + for (i=0; i + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. + + */ + +#include +#include "mpileup.h" + +struct _mpileup_t +{ + int x; +}; + +mpileup_t *mpileup_init(void) +{ + return NULL; +} + +void mpileup_set(mpileup_t *mplp, mpileup_opt_t key, void *value) +{ +} + +void mpileup_destroy(mpileup_t *mplp) +{ +} + diff --git a/mpileup2/mpileup.h b/mpileup2/mpileup.h new file mode 100644 index 000000000..4b73edf83 --- /dev/null +++ b/mpileup2/mpileup.h @@ -0,0 +1,69 @@ +/* mileup2.h -- mpileup v2.0 API + + Copyright (C) 2022 Genome Research Ltd. + + Author: Petr Danecek + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. + + */ + +#ifndef __MPILEUP20_H__ +#define __MPILEUP20_H__ + +typedef struct _mpileup_t mpileup_t; + +// Various options +typedef enum +{ + // required + FASTA_REF, // char*, fasta reference + BAM, // char*, add another BAM/CRAM + BAM_FNAME, // char*, read list of BAMs/CRAMs from a file + + // optional + REGIONS, // char*, comma-separated regions + REGIONS_FNAME, // char*, file with a list of regions + TARGETS, + TARGETS_FNAME, + SAMPLES, // char*, comma-separated samples to include + SAMPLES_FNAME, // char*, file of samples to include + READ_GROUPS_FNAME, // char*, read groups to include or exclude, see `bcftools mpileup -G` documentation + + // bit flags + SMART_OVERLAPS, // int {0,1}, 1:disable read-pair overlap detection [0] + SMPL_IGNORE_RG, // int {0,1}, 1:ignore read groups, one bam = one sample [0] + + // numeric parameters + MAX_DP_PER_FILE, // int, maximum depth per file, regardless of the number of samples [250] + ADJUST_MQ, // int, --adjust-MQ value for downgrading reads with excessive mismatches, 0 to disable [0] + MIN_MQ, // int, skip alignments with mapQ smaller than MIN_MQ [0] + MIN_BQ, // int, skip bases with baseQ smaller than MIN_BQ [1] + MAX_BQ, // int, cap BQ for overly optimistics platforms [60] + DELTA_BQ, // int, --delta-BQ value for downgrading high BQ if neighbor_BQ is < BQ - 30 [30] +} +mpileup_opt_t; + +#define mpileup_set_opt(es,type,key,value) { type tmp = value; mpileup_set(es, key, (void*)&tmp); } + +mpileup_t *mpileup_init(void); +void mpileup_set(mpileup_t *mplp, mpileup_opt_t key, void *value); +void mpileup_destroy(mpileup_t *mplp); + +#endif From ed814e3447b926b24bc2717ac73c6befcaef8959 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Wed, 23 Nov 2022 17:03:29 +0000 Subject: [PATCH 2/6] Backup commit. Compiles but not functional --- mpileup.c | 16 ------ mpileup2.c | 118 ++++++++++++++++++++++----------------------- mpileup2/mpileup.c | 87 +++++++++++++++++++++++++++++++-- mpileup2/mpileup.h | 10 +++- 4 files changed, 150 insertions(+), 81 deletions(-) diff --git a/mpileup.c b/mpileup.c index e0fea92a8..7241d9ba2 100644 --- a/mpileup.c +++ b/mpileup.c @@ -47,9 +47,6 @@ DEALINGS IN THE SOFTWARE. */ #include "bam_sample.h" #include "gvcf.h" -#define MPLP_BCF 1 -#define MPLP_VCF (1<<1) -#define MPLP_NO_COMP (1<<2) #define MPLP_NO_ORPHAN (1<<3) #define MPLP_REALN (1<<4) #define MPLP_NO_INDEL (1<<5) @@ -1560,19 +1557,6 @@ int main_mpileup(int argc, char *argv[]) fprintf(stderr,"[warning] The -a DP option is required with --gvcf, switching on.\n"); mplp.fmt_flag |= B2B_FMT_DP; } - if ( mplp.flag&(MPLP_BCF|MPLP_VCF|MPLP_NO_COMP) ) - { - if ( mplp.flag&MPLP_VCF ) - { - if ( mplp.flag&MPLP_NO_COMP ) mplp.output_type = FT_VCF; - else mplp.output_type = FT_VCF_GZ; - } - else if ( mplp.flag&MPLP_BCF ) - { - if ( mplp.flag&MPLP_NO_COMP ) mplp.output_type = FT_BCF; - else mplp.output_type = FT_BCF_GZ; - } - } if ( !(mplp.flag&MPLP_REALN) && mplp.flag&MPLP_REDO_BAQ ) { fprintf(stderr,"Error: The -B option cannot be combined with -E\n"); diff --git a/mpileup2.c b/mpileup2.c index b0083a1ee..dc8f58e6d 100644 --- a/mpileup2.c +++ b/mpileup2.c @@ -59,9 +59,6 @@ controlled via mpileup_set_option() #include "bam2bcf.h" #include "gvcf.h" -#define MPLP_BCF 1 -#define MPLP_VCF (1<<1) -#define MPLP_NO_COMP (1<<2) #define MPLP_NO_ORPHAN (1<<3) #define MPLP_REALN (1<<4) #define MPLP_NO_INDEL (1<<5) @@ -94,7 +91,6 @@ typedef struct { max_indel_depth, max_read_len, fmt_flag, ambig_reads; int rflag_skip_any_unset, rflag_skip_all_unset, rflag_skip_any_set, rflag_skip_all_set, output_type; int openQ, extQ, tandemQ, min_support, indel_win_size; // for indels - double min_frac; // for indels double indel_bias; char *reg_fname, *pl_list, *fai_fname, *output_fname; int reg_is_file, record_cmd_line, n_threads, clevel; @@ -217,12 +213,12 @@ static void list_annotations(FILE *fp) "\n"); } -static void print_usage(FILE *fp, const mplp_conf_t *mplp) +static void print_usage(args_t *args, FILE *fp) { - char *tmp_skip_all_set = bam_flag2str(mplp->rflag_skip_all_set); - char *tmp_skip_any_unset = bam_flag2str(mplp->rflag_skip_any_unset); - char *tmp_skip_all_unset = bam_flag2str(mplp->rflag_skip_all_unset); - char *tmp_skip_any_set = bam_flag2str(mplp->rflag_skip_any_set); + char *tmp_skip_all_set = bam_flag2str(args->rflag_skip_all_set); + char *tmp_skip_any_unset = bam_flag2str(args->rflag_skip_any_unset); + char *tmp_skip_all_unset = bam_flag2str(args->rflag_skip_all_unset); + char *tmp_skip_any_set = bam_flag2str(args->rflag_skip_any_set); fprintf(fp, "\n" @@ -235,19 +231,19 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp) " -B, --no-BAQ Disable BAQ (per-Base Alignment Quality)\n" " -C, --adjust-MQ INT Adjust mapping quality [0]\n" " -D, --full-BAQ Apply BAQ everywhere, not just in problematic regions\n" - " -d, --max-depth INT Max raw per-file depth; avoids excessive memory usage [%d]\n", mplp->max_depth); + " -d, --max-depth INT Max raw per-file depth; avoids excessive memory usage [%d]\n", mpileup_get_opt(args->mplp, int, MAX_DP_PER_FILE)); fprintf(fp, " -E, --redo-BAQ Recalculate BAQ on the fly, ignore existing BQs\n" " -f, --fasta-ref FILE Faidx indexed reference sequence file\n" " --no-reference Do not require fasta reference file\n" " -G, --read-groups FILE Select or exclude read groups listed in the file\n" - " -q, --min-MQ INT Skip alignments with mapQ smaller than INT [%d]\n", mplp->min_mq); + " -q, --min-MQ INT Skip alignments with mapQ smaller than INT [%d]\n", mpileup_get_opt(args->mplp, int, MIN_MQ)); fprintf(fp, - " -Q, --min-BQ INT Skip bases with baseQ/BAQ smaller than INT [%d]\n", mplp->min_baseQ); + " -Q, --min-BQ INT Skip bases with baseQ/BAQ smaller than INT [%d]\n", mpileup_get_opt(args->mplp, int, MIN_BQ)); fprintf(fp, - " --max-BQ INT Limit baseQ/BAQ to no more than INT [%d]\n", mplp->max_baseQ); + " --max-BQ INT Limit baseQ/BAQ to no more than INT [%d]\n", mpileup_get_opt(args->mplp, int, MAX_BQ)); fprintf(fp, - " --delta-BQ INT Use neighbour_qual + INT if less than qual [%d]\n", mplp->delta_baseQ); + " --delta-BQ INT Use neighbour_qual + INT if less than qual [%d]\n", mpileup_get_opt(args->mplp, int, DELTA_BQ)); fprintf(fp, " -r, --regions REG[,...] Comma separated list of regions in which pileup is generated\n" " -R, --regions-file FILE Restrict to regions listed in a file\n" @@ -278,28 +274,28 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp) "\n" "SNP/INDEL genotype likelihoods options:\n" " -X, --config STR Specify platform specific profiles (see below)\n" - " -e, --ext-prob INT Phred-scaled gap extension seq error probability [%d]\n", mplp->extQ); + " -e, --ext-prob INT Phred-scaled gap extension seq error probability [%d]\n", args->bca.extQ); fprintf(fp, - " -F, --gap-frac FLOAT Minimum fraction of gapped reads [%g]\n", mplp->min_frac); + " -F, --gap-frac FLOAT Minimum fraction of gapped reads [%f]\n", mpileup_get_opt(args->mplp, float, MIN_REALN_FRAC)); fprintf(fp, - " -h, --tandem-qual INT Coefficient for homopolymer errors [%d]\n", mplp->tandemQ); + " -h, --tandem-qual INT Coefficient for homopolymer errors [%d]\n", args->bca.tandemQ); fprintf(fp, " -I, --skip-indels Do not perform indel calling\n" - " -L, --max-idepth INT Maximum per-file depth for INDEL calling [%d]\n", mplp->max_indel_depth); + " -L, --max-idepth INT Subsample to maximum per-sample depth for INDEL calling [%d]\n", mpileup_get_opt(args->mplp, int, MAX_REALN_DP)); fprintf(fp, - " -m, --min-ireads INT Minimum number gapped reads for indel candidates [%d]\n", mplp->min_support); + " -m, --min-ireads INT Minimum number gapped reads for indel candidates [%d]\n", mpileup_get_opt(args->mplp, int, MIN_REALN_DP)); fprintf(fp, - " -M, --max-read-len INT Maximum length of read to pass to BAQ algorithm [%d]\n", mplp->max_read_len); + " -M, --max-read-len INT Maximum length of read to pass to BAQ algorithm [%d]\n", mpileup_get_opt(args->mplp, int, MAX_REALN_LEN)); fprintf(fp, - " -o, --open-prob INT Phred-scaled gap open seq error probability [%d]\n", mplp->openQ); + " -o, --open-prob INT Phred-scaled gap open seq error probability [%d]\n", args->bca.openQ); fprintf(fp, " -p, --per-sample-mF Apply -m and -F per-sample for increased sensitivity\n" " -P, --platforms STR Comma separated list of platforms for indels [all]\n" " --ar, --ambig-reads STR What to do with ambiguous indel reads: drop,incAD,incAD0 [drop]\n"); fprintf(fp, - " --indel-bias FLOAT Raise to favour recall over precision [%.2f]\n", mplp->indel_bias); + " --indel-bias FLOAT Raise to favour recall over precision [%.2f]\n", args->bca.indel_bias); fprintf(fp, - " --indel-size INT Approximate maximum indel size considered [%d]\n", mplp->indel_win_size); + " --indel-size INT Approximate maximum indel size considered [%d]\n", args->bca.indel_win_size); fprintf(fp,"\n"); fprintf(fp, "Configuration profiles activated with -X, --config:\n" @@ -321,15 +317,32 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp) free(tmp_skip_any_set); } +static void exit_nicely(args_t *args) +{ + mpileup_destroy(args->mplp); + free(args); +} + int main_mpileup2(int argc, char *argv[]) { int c, ret = 0; args_t *args = (args_t*) calloc(1,sizeof(args_t)); args->argc = argc; args->argv = argv; - args->mplp = mpileup_init(); args->rflag_skip_any_set = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP; args->flag = MPLP_NO_ORPHAN | MPLP_REALN | MPLP_REALN_PARTIAL | MPLP_SMART_OVERLAPS; - args->bca.openQ = 40; + args->bca.openQ = 40; + args->bca.extQ = 20; + args->bca.tandemQ = 500; + + args->mplp = mpileup_init(); + mpileup_set_opt(args->mplp, int, MAX_DP_PER_FILE, 250); + mpileup_set_opt(args->mplp, int, MIN_MQ, 0); + mpileup_set_opt(args->mplp, int, MAX_BQ, 60); + mpileup_set_opt(args->mplp, int, DELTA_BQ, 30); + mpileup_set_opt(args->mplp, float, MIN_REALN_FRAC, 0.05); + mpileup_set_opt(args->mplp, int, MIN_REALN_DP, 2); + mpileup_set_opt(args->mplp, int, MAX_REALN_DP, 250); + mpileup_set_opt(args->mplp, int, MAX_REALN_LEN, 500); int noref = 0; @@ -339,7 +352,6 @@ int main_mpileup2(int argc, char *argv[]) int nfiles = 0, use_orphan = 0, noref = 0; mplp_conf_t mplp; memset(&mplp, 0, sizeof(mplp_conf_t)); - mplp.min_baseQ = 1; mplp.max_baseQ = 60; mplp.delta_baseQ = 30; mplp.capQ_thres = 0; @@ -545,9 +557,9 @@ int main_mpileup2(int argc, char *argv[]) } break; case 'A': args->flag &= ~MPLP_NO_ORPHAN; break; - case 'F': args->bca.min_frac = atof(optarg); break; - case 'm': args->bca.min_support = atoi(optarg); break; - case 'L': error("todo: -L\n"); /* mplp.max_indel_depth = atoi(optarg); */ break; + case 'F': mpileup_set_opt(args->mplp, float, MIN_REALN_FRAC, atof(optarg)); break; + case 'm': mpileup_set_opt(args->mplp, int, MIN_REALN_DP, atoi(optarg)); break; + case 'L': mpileup_set_opt(args->mplp, int, MAX_REALN_DP, atoi(optarg)); break; case 'G': mpileup_set_opt(args->mplp, char*, READ_GROUPS_FNAME, optarg); break; case 'a': if (optarg[0]=='?') { @@ -559,26 +571,26 @@ int main_mpileup2(int argc, char *argv[]) case 'M': args->max_read_len = atoi(optarg); break; case 'X': if (strcasecmp(optarg, "pacbio-ccs") == 0) { - args->bca.min_frac = 0.1; - args->bca.min_baseQ = 5; - args->bca.max_baseQ = 50; - args->bca.delta_baseQ = 10; + mpileup_set_opt(args->mplp, float, MIN_REALN_FRAC, 0.1); + mpileup_set_opt(args->mplp, int, MIN_BQ, 5); + mpileup_set_opt(args->mplp, int, MAX_BQ, 50); + mpileup_set_opt(args->mplp, int, DELTA_BQ, 10); args->bca.openQ = 25; args->bca.extQ = 1; args->flag |= MPLP_REALN_PARTIAL; - args->max_read_len = 99999; + mpileup_set_opt(args->mplp, int, MAX_REALN_LEN, 99999); } else if (strcasecmp(optarg, "ont") == 0) { fprintf(stderr, "For ONT it may be beneficial to also run bcftools call with " "a higher -P, eg -P0.01 or -P 0.1\n"); - args->bca.min_baseQ = 5; - args->bca.max_baseQ = 30; + mpileup_set_opt(args->mplp, int, MIN_BQ, 5); + mpileup_set_opt(args->mplp, int, MAX_BQ, 30); args->flag &= ~MPLP_REALN; args->flag |= MPLP_NO_INDEL; } else if (strcasecmp(optarg, "1.12") == 0) { // 1.12 and earlier - args->bca.min_frac = 0.002; + mpileup_set_opt(args->mplp, float, MIN_REALN_FRAC, 0.002); args->bca.min_support = 1; - args->bca.min_baseQ = 13; + mpileup_set_opt(args->mplp, int, MIN_BQ, 13); args->bca.tandemQ = 100; args->flag &= ~MPLP_REALN_PARTIAL; args->flag |= MPLP_REALN; @@ -605,35 +617,21 @@ int main_mpileup2(int argc, char *argv[]) } } -#if 0 - if ( mplp.gvcf && !(mplp.fmt_flag&B2B_FMT_DP) ) - { - fprintf(stderr,"[warning] The -a DP option is required with --gvcf, switching on.\n"); - mplp.fmt_flag |= B2B_FMT_DP; - } - if ( mplp.flag&(MPLP_BCF|MPLP_VCF|MPLP_NO_COMP) ) + if ( args->gvcf && !(args->bca.fmt_flag&B2B_FMT_DP) ) args->bca.fmt_flag |= B2B_FMT_DP; + if ( argc==1 ) { - if ( mplp.flag&MPLP_VCF ) - { - if ( mplp.flag&MPLP_NO_COMP ) mplp.output_type = FT_VCF; - else mplp.output_type = FT_VCF_GZ; - } - else if ( mplp.flag&MPLP_BCF ) - { - if ( mplp.flag&MPLP_NO_COMP ) mplp.output_type = FT_BCF; - else mplp.output_type = FT_BCF_GZ; - } + print_usage(args, stderr); + exit_nicely(args); + return 1; } + exit_nicely(args); + +#if 0 if ( !(mplp.flag&MPLP_REALN) && mplp.flag&MPLP_REDO_BAQ ) { fprintf(stderr,"Error: The -B option cannot be combined with -E\n"); return 1; } - if (argc == 1) - { - print_usage(stderr, &mplp); - return 1; - } if (!mplp.fai && !noref) { fprintf(stderr,"Error: mpileup requires the --fasta-ref option by default; use --no-reference to run without a fasta reference\n"); return 1; diff --git a/mpileup2/mpileup.c b/mpileup2/mpileup.c index 8a0069370..63049e9b3 100644 --- a/mpileup2/mpileup.c +++ b/mpileup2/mpileup.c @@ -25,23 +25,104 @@ */ #include +#include +#include #include "mpileup.h" +void error(const char *format, ...); +void mplp_add_bam(mpileup_t *mplp, char *bam); +void mplp_add_bams(mpileup_t *mplp, char *bams_fname); + struct _mpileup_t { - int x; + char *fasta_ref, *regions, *regions_fname, *targets, *targets_fname, *samples, *samples_fname, *read_groups_fname; + int smart_overlaps, smpl_ignore_rg, max_dp_per_file, adjust_mq, min_mq, min_bq, max_bq, delta_bq; + int min_realn_dp, max_realn_dp, max_realn_len; + float min_realn_frac; }; mpileup_t *mpileup_init(void) { - return NULL; + mpileup_t *mplp = (mpileup_t*) calloc(1,sizeof(mpileup_t)); + mpileup_set_opt(mplp,int,MAX_DP_PER_FILE,250); + mpileup_set_opt(mplp,int,MIN_BQ,1); + mpileup_set_opt(mplp,int,MAX_BQ,60); + mpileup_set_opt(mplp,int,DELTA_BQ,30); + mpileup_set_opt(mplp,float,MIN_REALN_FRAC,0.05); + mpileup_set_opt(mplp,int,MIN_REALN_DP,2); + mpileup_set_opt(mplp,int,MAX_REALN_DP,250); + mpileup_set_opt(mplp,int,MAX_REALN_LEN,500); + return mplp; +} + +int mpileup_set(mpileup_t *mplp, mpileup_opt_t key, void *value) +{ + switch (key) + { + case FASTA_REF: free(mplp->fasta_ref); mplp->fasta_ref = strdup(*((char**)value)); break; + case BAM: mplp_add_bam(mplp,*((char**)value)); break; + case BAM_FNAME: mplp_add_bams(mplp,*((char**)value)); break; + case REGIONS: free(mplp->regions); mplp->regions = strdup(*((char**)value)); break; + case REGIONS_FNAME: free(mplp->regions_fname); mplp->regions_fname = strdup(*((char**)value)); break; + case TARGETS: free(mplp->targets); mplp->targets = strdup(*((char**)value)); break; + case TARGETS_FNAME: free(mplp->targets_fname); mplp->targets_fname = strdup(*((char**)value)); break; + case SAMPLES: free(mplp->samples); mplp->samples = strdup(*((char**)value)); break; + case SAMPLES_FNAME: free(mplp->samples_fname); mplp->samples_fname = strdup(*((char**)value)); break; + case READ_GROUPS_FNAME: free(mplp->read_groups_fname); mplp->read_groups_fname = strdup(*((char**)value)); break; + case SMART_OVERLAPS: mplp->smart_overlaps = *((int*)value); break; + case SMPL_IGNORE_RG: mplp->smpl_ignore_rg = *((int*)value); break; + case MAX_DP_PER_FILE: mplp->max_dp_per_file = *((int*)value); break; + case ADJUST_MQ: mplp->adjust_mq = *((int*)value); break; + case MIN_MQ: mplp->min_mq = *((int*)value); break; + case MIN_BQ: mplp->min_bq = *((int*)value); break; + case MAX_BQ: mplp->max_bq = *((int*)value); break; + case DELTA_BQ: mplp->delta_bq = *((int*)value); break; + case MIN_REALN_FRAC: mplp->min_realn_frac = *((float*)value); break; + case MIN_REALN_DP: mplp->min_realn_dp = *((int*)value); break; + case MAX_REALN_DP: mplp->max_realn_dp = *((int*)value); break; + case MAX_REALN_LEN: mplp->max_realn_len = *((int*)value); break; + default: error("Todo: mpileup_set key=%d\n",(int)key); break; + } + return 0; } -void mpileup_set(mpileup_t *mplp, mpileup_opt_t key, void *value) +void *mpileup_get(mpileup_t *mplp, mpileup_opt_t key) { + switch (key) + { + case SMART_OVERLAPS: return &mplp->smart_overlaps; break; + case SMPL_IGNORE_RG: return &mplp->smpl_ignore_rg; break; + case MAX_DP_PER_FILE: return &mplp->max_dp_per_file; break; + case ADJUST_MQ: return &mplp->adjust_mq; break; + case MIN_MQ: return &mplp->min_mq; break; + case MIN_BQ: return &mplp->min_bq; break; + case MAX_BQ: return &mplp->max_bq; break; + case DELTA_BQ: return &mplp->delta_bq; break; + case MIN_REALN_FRAC: return &mplp->min_realn_frac; break; + case MIN_REALN_DP: return &mplp->min_realn_dp; break; + case MAX_REALN_DP: return &mplp->max_realn_dp; break; + case MAX_REALN_LEN: return &mplp->max_realn_len; break; + default: error("Todo: mpileup_get key=%d\n",(int)key); break; + } + return NULL; } void mpileup_destroy(mpileup_t *mplp) { + free(mplp->fasta_ref); + free(mplp->regions); + free(mplp->regions_fname); + free(mplp->targets); + free(mplp->targets_fname); + free(mplp->samples); + free(mplp->samples_fname); + free(mplp->read_groups_fname); + free(mplp); } +void mplp_add_bam(mpileup_t *mplp, char *bam) +{ +} +void mplp_add_bams(mpileup_t *mplp, char *bams_fname) +{ +} diff --git a/mpileup2/mpileup.h b/mpileup2/mpileup.h index 4b73edf83..43352cc70 100644 --- a/mpileup2/mpileup.h +++ b/mpileup2/mpileup.h @@ -57,13 +57,19 @@ typedef enum MIN_BQ, // int, skip bases with baseQ smaller than MIN_BQ [1] MAX_BQ, // int, cap BQ for overly optimistics platforms [60] DELTA_BQ, // int, --delta-BQ value for downgrading high BQ if neighbor_BQ is < BQ - 30 [30] + MIN_REALN_FRAC, // float, minimum fraction of reads with evidence for an indel to consider the site needing realignment [0.05] + MIN_REALN_DP, // int, minimum number of reads with evidence for an indel to consider the site needing realignment [2] + MAX_REALN_DP, // int, subsample to this many reads for realignment [250] + MAX_REALN_LEN, // int, realign reads this long max [500] } mpileup_opt_t; -#define mpileup_set_opt(es,type,key,value) { type tmp = value; mpileup_set(es, key, (void*)&tmp); } +#define mpileup_set_opt(mplp,type,key,value) { type tmp = value; mpileup_set(mplp, key, (void*)&tmp); } +#define mpileup_get_opt(mplp,type,key) (*(type*)mpileup_get(mplp, key)) mpileup_t *mpileup_init(void); -void mpileup_set(mpileup_t *mplp, mpileup_opt_t key, void *value); +int mpileup_set(mpileup_t *mplp, mpileup_opt_t key, void *value); +void *mpileup_get(mpileup_t *mplp, mpileup_opt_t key); void mpileup_destroy(mpileup_t *mplp); #endif From 3ef5f6a4e7cb584dfe7cc23f989f1d26bec9f746 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Wed, 23 Nov 2022 17:03:29 +0000 Subject: [PATCH 3/6] Backup commit. Compiles but not functional --- mpileup.c | 16 ----- mpileup2.c | 124 ++++++++++++++++++++------------------- mpileup2/mpileup.c | 143 ++++++++++++++++++++++++++++++++++++++++++++- mpileup2/mpileup.h | 17 ++++-- 4 files changed, 215 insertions(+), 85 deletions(-) diff --git a/mpileup.c b/mpileup.c index e0fea92a8..7241d9ba2 100644 --- a/mpileup.c +++ b/mpileup.c @@ -47,9 +47,6 @@ DEALINGS IN THE SOFTWARE. */ #include "bam_sample.h" #include "gvcf.h" -#define MPLP_BCF 1 -#define MPLP_VCF (1<<1) -#define MPLP_NO_COMP (1<<2) #define MPLP_NO_ORPHAN (1<<3) #define MPLP_REALN (1<<4) #define MPLP_NO_INDEL (1<<5) @@ -1560,19 +1557,6 @@ int main_mpileup(int argc, char *argv[]) fprintf(stderr,"[warning] The -a DP option is required with --gvcf, switching on.\n"); mplp.fmt_flag |= B2B_FMT_DP; } - if ( mplp.flag&(MPLP_BCF|MPLP_VCF|MPLP_NO_COMP) ) - { - if ( mplp.flag&MPLP_VCF ) - { - if ( mplp.flag&MPLP_NO_COMP ) mplp.output_type = FT_VCF; - else mplp.output_type = FT_VCF_GZ; - } - else if ( mplp.flag&MPLP_BCF ) - { - if ( mplp.flag&MPLP_NO_COMP ) mplp.output_type = FT_BCF; - else mplp.output_type = FT_BCF_GZ; - } - } if ( !(mplp.flag&MPLP_REALN) && mplp.flag&MPLP_REDO_BAQ ) { fprintf(stderr,"Error: The -B option cannot be combined with -E\n"); diff --git a/mpileup2.c b/mpileup2.c index b0083a1ee..3d3ad00ab 100644 --- a/mpileup2.c +++ b/mpileup2.c @@ -59,9 +59,6 @@ controlled via mpileup_set_option() #include "bam2bcf.h" #include "gvcf.h" -#define MPLP_BCF 1 -#define MPLP_VCF (1<<1) -#define MPLP_NO_COMP (1<<2) #define MPLP_NO_ORPHAN (1<<3) #define MPLP_REALN (1<<4) #define MPLP_NO_INDEL (1<<5) @@ -94,7 +91,6 @@ typedef struct { max_indel_depth, max_read_len, fmt_flag, ambig_reads; int rflag_skip_any_unset, rflag_skip_all_unset, rflag_skip_any_set, rflag_skip_all_set, output_type; int openQ, extQ, tandemQ, min_support, indel_win_size; // for indels - double min_frac; // for indels double indel_bias; char *reg_fname, *pl_list, *fai_fname, *output_fname; int reg_is_file, record_cmd_line, n_threads, clevel; @@ -217,12 +213,12 @@ static void list_annotations(FILE *fp) "\n"); } -static void print_usage(FILE *fp, const mplp_conf_t *mplp) +static void print_usage(args_t *args, FILE *fp) { - char *tmp_skip_all_set = bam_flag2str(mplp->rflag_skip_all_set); - char *tmp_skip_any_unset = bam_flag2str(mplp->rflag_skip_any_unset); - char *tmp_skip_all_unset = bam_flag2str(mplp->rflag_skip_all_unset); - char *tmp_skip_any_set = bam_flag2str(mplp->rflag_skip_any_set); + char *tmp_skip_all_set = bam_flag2str(args->rflag_skip_all_set); + char *tmp_skip_any_unset = bam_flag2str(args->rflag_skip_any_unset); + char *tmp_skip_all_unset = bam_flag2str(args->rflag_skip_all_unset); + char *tmp_skip_any_set = bam_flag2str(args->rflag_skip_any_set); fprintf(fp, "\n" @@ -235,19 +231,19 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp) " -B, --no-BAQ Disable BAQ (per-Base Alignment Quality)\n" " -C, --adjust-MQ INT Adjust mapping quality [0]\n" " -D, --full-BAQ Apply BAQ everywhere, not just in problematic regions\n" - " -d, --max-depth INT Max raw per-file depth; avoids excessive memory usage [%d]\n", mplp->max_depth); + " -d, --max-depth INT Max raw per-file depth; avoids excessive memory usage [%d]\n", mpileup_get_opt(args->mplp, int, MAX_DP_PER_FILE)); fprintf(fp, " -E, --redo-BAQ Recalculate BAQ on the fly, ignore existing BQs\n" " -f, --fasta-ref FILE Faidx indexed reference sequence file\n" " --no-reference Do not require fasta reference file\n" " -G, --read-groups FILE Select or exclude read groups listed in the file\n" - " -q, --min-MQ INT Skip alignments with mapQ smaller than INT [%d]\n", mplp->min_mq); + " -q, --min-MQ INT Skip alignments with mapQ smaller than INT [%d]\n", mpileup_get_opt(args->mplp, int, MIN_MQ)); fprintf(fp, - " -Q, --min-BQ INT Skip bases with baseQ/BAQ smaller than INT [%d]\n", mplp->min_baseQ); + " -Q, --min-BQ INT Skip bases with baseQ/BAQ smaller than INT [%d]\n", mpileup_get_opt(args->mplp, int, MIN_BQ)); fprintf(fp, - " --max-BQ INT Limit baseQ/BAQ to no more than INT [%d]\n", mplp->max_baseQ); + " --max-BQ INT Limit baseQ/BAQ to no more than INT [%d]\n", mpileup_get_opt(args->mplp, int, MAX_BQ)); fprintf(fp, - " --delta-BQ INT Use neighbour_qual + INT if less than qual [%d]\n", mplp->delta_baseQ); + " --delta-BQ INT Use neighbour_qual + INT if less than qual [%d]\n", mpileup_get_opt(args->mplp, int, DELTA_BQ)); fprintf(fp, " -r, --regions REG[,...] Comma separated list of regions in which pileup is generated\n" " -R, --regions-file FILE Restrict to regions listed in a file\n" @@ -278,28 +274,28 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp) "\n" "SNP/INDEL genotype likelihoods options:\n" " -X, --config STR Specify platform specific profiles (see below)\n" - " -e, --ext-prob INT Phred-scaled gap extension seq error probability [%d]\n", mplp->extQ); + " -e, --ext-prob INT Phred-scaled gap extension seq error probability [%d]\n", args->bca.extQ); fprintf(fp, - " -F, --gap-frac FLOAT Minimum fraction of gapped reads [%g]\n", mplp->min_frac); + " -F, --gap-frac FLOAT Minimum fraction of gapped reads [%f]\n", mpileup_get_opt(args->mplp, float, MIN_REALN_FRAC)); fprintf(fp, - " -h, --tandem-qual INT Coefficient for homopolymer errors [%d]\n", mplp->tandemQ); + " -h, --tandem-qual INT Coefficient for homopolymer errors [%d]\n", args->bca.tandemQ); fprintf(fp, " -I, --skip-indels Do not perform indel calling\n" - " -L, --max-idepth INT Maximum per-file depth for INDEL calling [%d]\n", mplp->max_indel_depth); + " -L, --max-idepth INT Subsample to maximum per-sample depth for INDEL calling [%d]\n", mpileup_get_opt(args->mplp, int, MAX_REALN_DP)); fprintf(fp, - " -m, --min-ireads INT Minimum number gapped reads for indel candidates [%d]\n", mplp->min_support); + " -m, --min-ireads INT Minimum number gapped reads for indel candidates [%d]\n", mpileup_get_opt(args->mplp, int, MIN_REALN_DP)); fprintf(fp, - " -M, --max-read-len INT Maximum length of read to pass to BAQ algorithm [%d]\n", mplp->max_read_len); + " -M, --max-read-len INT Maximum length of read to pass to BAQ algorithm [%d]\n", mpileup_get_opt(args->mplp, int, MAX_REALN_LEN)); fprintf(fp, - " -o, --open-prob INT Phred-scaled gap open seq error probability [%d]\n", mplp->openQ); + " -o, --open-prob INT Phred-scaled gap open seq error probability [%d]\n", args->bca.openQ); fprintf(fp, " -p, --per-sample-mF Apply -m and -F per-sample for increased sensitivity\n" " -P, --platforms STR Comma separated list of platforms for indels [all]\n" " --ar, --ambig-reads STR What to do with ambiguous indel reads: drop,incAD,incAD0 [drop]\n"); fprintf(fp, - " --indel-bias FLOAT Raise to favour recall over precision [%.2f]\n", mplp->indel_bias); + " --indel-bias FLOAT Raise to favour recall over precision [%.2f]\n", args->bca.indel_bias); fprintf(fp, - " --indel-size INT Approximate maximum indel size considered [%d]\n", mplp->indel_win_size); + " --indel-size INT Approximate maximum indel size considered [%d]\n", args->bca.indel_win_size); fprintf(fp,"\n"); fprintf(fp, "Configuration profiles activated with -X, --config:\n" @@ -321,15 +317,32 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp) free(tmp_skip_any_set); } +static void exit_nicely(args_t *args) +{ + mpileup_destroy(args->mplp); + free(args); +} + int main_mpileup2(int argc, char *argv[]) { int c, ret = 0; args_t *args = (args_t*) calloc(1,sizeof(args_t)); args->argc = argc; args->argv = argv; - args->mplp = mpileup_init(); args->rflag_skip_any_set = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP; args->flag = MPLP_NO_ORPHAN | MPLP_REALN | MPLP_REALN_PARTIAL | MPLP_SMART_OVERLAPS; - args->bca.openQ = 40; + args->bca.openQ = 40; + args->bca.extQ = 20; + args->bca.tandemQ = 500; + + args->mplp = mpileup_init(); + mpileup_set_opt(args->mplp, int, MAX_DP_PER_FILE, 250); + mpileup_set_opt(args->mplp, int, MIN_MQ, 0); + mpileup_set_opt(args->mplp, int, MAX_BQ, 60); + mpileup_set_opt(args->mplp, int, DELTA_BQ, 30); + mpileup_set_opt(args->mplp, float, MIN_REALN_FRAC, 0.05); + mpileup_set_opt(args->mplp, int, MIN_REALN_DP, 2); + mpileup_set_opt(args->mplp, int, MAX_REALN_DP, 250); + mpileup_set_opt(args->mplp, int, MAX_REALN_LEN, 500); int noref = 0; @@ -339,7 +352,6 @@ int main_mpileup2(int argc, char *argv[]) int nfiles = 0, use_orphan = 0, noref = 0; mplp_conf_t mplp; memset(&mplp, 0, sizeof(mplp_conf_t)); - mplp.min_baseQ = 1; mplp.max_baseQ = 60; mplp.delta_baseQ = 30; mplp.capQ_thres = 0; @@ -520,7 +532,7 @@ int main_mpileup2(int argc, char *argv[]) case 'Q': mpileup_set_opt(args->mplp, int, MIN_BQ, atoi(optarg)); break; case 11: mpileup_set_opt(args->mplp, int, MAX_BQ, atoi(optarg)); break; case 12: mpileup_set_opt(args->mplp, int, DELTA_BQ, atoi(optarg)); break; - case 'b': mpileup_set_opt(args->mplp, char*, BAM_FNAME, optarg); break; + case 'b': if ( mpileup_set(args->mplp, BAM_FNAME, &optarg)!=0 ) exit_nicely(args); break; case 'o': /* this option used to be -o INT and -o FILE. In the new code disable -o as an alias of --open-prob as its use is very rare */ args->output_fname = optarg; @@ -545,9 +557,9 @@ int main_mpileup2(int argc, char *argv[]) } break; case 'A': args->flag &= ~MPLP_NO_ORPHAN; break; - case 'F': args->bca.min_frac = atof(optarg); break; - case 'm': args->bca.min_support = atoi(optarg); break; - case 'L': error("todo: -L\n"); /* mplp.max_indel_depth = atoi(optarg); */ break; + case 'F': mpileup_set_opt(args->mplp, float, MIN_REALN_FRAC, atof(optarg)); break; + case 'm': mpileup_set_opt(args->mplp, int, MIN_REALN_DP, atoi(optarg)); break; + case 'L': mpileup_set_opt(args->mplp, int, MAX_REALN_DP, atoi(optarg)); break; case 'G': mpileup_set_opt(args->mplp, char*, READ_GROUPS_FNAME, optarg); break; case 'a': if (optarg[0]=='?') { @@ -559,26 +571,26 @@ int main_mpileup2(int argc, char *argv[]) case 'M': args->max_read_len = atoi(optarg); break; case 'X': if (strcasecmp(optarg, "pacbio-ccs") == 0) { - args->bca.min_frac = 0.1; - args->bca.min_baseQ = 5; - args->bca.max_baseQ = 50; - args->bca.delta_baseQ = 10; + mpileup_set_opt(args->mplp, float, MIN_REALN_FRAC, 0.1); + mpileup_set_opt(args->mplp, int, MIN_BQ, 5); + mpileup_set_opt(args->mplp, int, MAX_BQ, 50); + mpileup_set_opt(args->mplp, int, DELTA_BQ, 10); args->bca.openQ = 25; args->bca.extQ = 1; args->flag |= MPLP_REALN_PARTIAL; - args->max_read_len = 99999; + mpileup_set_opt(args->mplp, int, MAX_REALN_LEN, 99999); } else if (strcasecmp(optarg, "ont") == 0) { fprintf(stderr, "For ONT it may be beneficial to also run bcftools call with " "a higher -P, eg -P0.01 or -P 0.1\n"); - args->bca.min_baseQ = 5; - args->bca.max_baseQ = 30; + mpileup_set_opt(args->mplp, int, MIN_BQ, 5); + mpileup_set_opt(args->mplp, int, MAX_BQ, 30); args->flag &= ~MPLP_REALN; args->flag |= MPLP_NO_INDEL; } else if (strcasecmp(optarg, "1.12") == 0) { // 1.12 and earlier - args->bca.min_frac = 0.002; + mpileup_set_opt(args->mplp, float, MIN_REALN_FRAC, 0.002); args->bca.min_support = 1; - args->bca.min_baseQ = 13; + mpileup_set_opt(args->mplp, int, MIN_BQ, 13); args->bca.tandemQ = 100; args->flag &= ~MPLP_REALN_PARTIAL; args->flag |= MPLP_REALN; @@ -605,35 +617,25 @@ int main_mpileup2(int argc, char *argv[]) } } -#if 0 - if ( mplp.gvcf && !(mplp.fmt_flag&B2B_FMT_DP) ) - { - fprintf(stderr,"[warning] The -a DP option is required with --gvcf, switching on.\n"); - mplp.fmt_flag |= B2B_FMT_DP; - } - if ( mplp.flag&(MPLP_BCF|MPLP_VCF|MPLP_NO_COMP) ) + if ( args->gvcf && !(args->bca.fmt_flag&B2B_FMT_DP) ) args->bca.fmt_flag |= B2B_FMT_DP; + if ( argc==1 ) { - if ( mplp.flag&MPLP_VCF ) - { - if ( mplp.flag&MPLP_NO_COMP ) mplp.output_type = FT_VCF; - else mplp.output_type = FT_VCF_GZ; - } - else if ( mplp.flag&MPLP_BCF ) - { - if ( mplp.flag&MPLP_NO_COMP ) mplp.output_type = FT_BCF; - else mplp.output_type = FT_BCF_GZ; - } + print_usage(args, stderr); + exit_nicely(args); + return 1; } + int i; + for (i=0; implp, BAM, &argv[optind+i])!=0 ) exit_nicely(args); + + exit_nicely(args); + +#if 0 if ( !(mplp.flag&MPLP_REALN) && mplp.flag&MPLP_REDO_BAQ ) { fprintf(stderr,"Error: The -B option cannot be combined with -E\n"); return 1; } - if (argc == 1) - { - print_usage(stderr, &mplp); - return 1; - } if (!mplp.fai && !noref) { fprintf(stderr,"Error: mpileup requires the --fasta-ref option by default; use --no-reference to run without a fasta reference\n"); return 1; diff --git a/mpileup2/mpileup.c b/mpileup2/mpileup.c index 8a0069370..3cef69653 100644 --- a/mpileup2/mpileup.c +++ b/mpileup2/mpileup.c @@ -25,23 +25,160 @@ */ #include +#include +#include +#include +#include +#include #include "mpileup.h" +static int mplp_add_bam(mpileup_t *mplp, char *bam, int is_file); + struct _mpileup_t { - int x; + char *fasta_ref, *regions, *regions_fname, *targets, *targets_fname, *samples, *samples_fname, *read_groups_fname; + int smart_overlaps, smpl_ignore_rg, max_dp_per_file, adjust_mq, min_mq, min_bq, max_bq, delta_bq; + int min_realn_dp, max_realn_dp, max_realn_len; + float min_realn_frac; + char **bam_names; + int nbam_names; }; mpileup_t *mpileup_init(void) { - return NULL; + mpileup_t *mplp = (mpileup_t*) calloc(1,sizeof(mpileup_t)); + mpileup_set_opt(mplp,int,MAX_DP_PER_FILE,250); + mpileup_set_opt(mplp,int,MIN_BQ,1); + mpileup_set_opt(mplp,int,MAX_BQ,60); + mpileup_set_opt(mplp,int,DELTA_BQ,30); + mpileup_set_opt(mplp,float,MIN_REALN_FRAC,0.05); + mpileup_set_opt(mplp,int,MIN_REALN_DP,2); + mpileup_set_opt(mplp,int,MAX_REALN_DP,250); + mpileup_set_opt(mplp,int,MAX_REALN_LEN,500); + return mplp; +} + +int mpileup_set(mpileup_t *mplp, mpileup_opt_t key, void *value) +{ + switch (key) + { + case FASTA_REF: free(mplp->fasta_ref); mplp->fasta_ref = strdup(*((char**)value)); break; + case BAM: return mplp_add_bam(mplp,*((char**)value),0); break; + case BAM_FNAME: return mplp_add_bam(mplp,*((char**)value),1); break; + case REGIONS: free(mplp->regions); mplp->regions = strdup(*((char**)value)); break; + case REGIONS_FNAME: free(mplp->regions_fname); mplp->regions_fname = strdup(*((char**)value)); break; + case TARGETS: free(mplp->targets); mplp->targets = strdup(*((char**)value)); break; + case TARGETS_FNAME: free(mplp->targets_fname); mplp->targets_fname = strdup(*((char**)value)); break; + case SAMPLES: free(mplp->samples); mplp->samples = strdup(*((char**)value)); break; + case SAMPLES_FNAME: free(mplp->samples_fname); mplp->samples_fname = strdup(*((char**)value)); break; + case READ_GROUPS_FNAME: free(mplp->read_groups_fname); mplp->read_groups_fname = strdup(*((char**)value)); break; + case SMART_OVERLAPS: mplp->smart_overlaps = *((int*)value); break; + case SMPL_IGNORE_RG: mplp->smpl_ignore_rg = *((int*)value); break; + case MAX_DP_PER_FILE: mplp->max_dp_per_file = *((int*)value); break; + case ADJUST_MQ: mplp->adjust_mq = *((int*)value); break; + case MIN_MQ: mplp->min_mq = *((int*)value); break; + case MIN_BQ: mplp->min_bq = *((int*)value); break; + case MAX_BQ: mplp->max_bq = *((int*)value); break; + case DELTA_BQ: mplp->delta_bq = *((int*)value); break; + case MIN_REALN_FRAC: mplp->min_realn_frac = *((float*)value); break; + case MIN_REALN_DP: mplp->min_realn_dp = *((int*)value); break; + case MAX_REALN_DP: mplp->max_realn_dp = *((int*)value); break; + case MAX_REALN_LEN: mplp->max_realn_len = *((int*)value); break; + default: hts_log_error("Todo: mpileup_set key=%d",(int)key); return 1; break; + } + return 0; } -void mpileup_set(mpileup_t *mplp, mpileup_opt_t key, void *value) +void *mpileup_get(mpileup_t *mplp, mpileup_opt_t key) { + switch (key) + { + case SMART_OVERLAPS: return &mplp->smart_overlaps; break; + case SMPL_IGNORE_RG: return &mplp->smpl_ignore_rg; break; + case MAX_DP_PER_FILE: return &mplp->max_dp_per_file; break; + case ADJUST_MQ: return &mplp->adjust_mq; break; + case MIN_MQ: return &mplp->min_mq; break; + case MIN_BQ: return &mplp->min_bq; break; + case MAX_BQ: return &mplp->max_bq; break; + case DELTA_BQ: return &mplp->delta_bq; break; + case MIN_REALN_FRAC: return &mplp->min_realn_frac; break; + case MIN_REALN_DP: return &mplp->min_realn_dp; break; + case MAX_REALN_DP: return &mplp->max_realn_dp; break; + case MAX_REALN_LEN: return &mplp->max_realn_len; break; + default: hts_log_error("Todo: mpileup_get key=%d",(int)key); return NULL; break; + } + return NULL; } void mpileup_destroy(mpileup_t *mplp) { + int i; + for (i=0; inbam_names; i++) free(mplp->bam_names[i]); + free(mplp->bam_names); + free(mplp->fasta_ref); + free(mplp->regions); + free(mplp->regions_fname); + free(mplp->targets); + free(mplp->targets_fname); + free(mplp->samples); + free(mplp->samples_fname); + free(mplp->read_groups_fname); + free(mplp); +} + +static int is_url(const char *str) +{ + static const char uri_scheme_chars[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+.-"; + return str[strspn(str, uri_scheme_chars)] == ':'; +} + +static int mplp_add_bam(mpileup_t *mplp, char *bam, int is_file) +{ + struct stat sbuf; + if ( is_file ) + { + int i, nnames = 0; + char **names = hts_readlist(bam, 1, &nnames); + for (i=0; inbam_names++; + char **tmp = (char**)realloc(mplp->bam_names,mplp->nbam_names*sizeof(*mplp->bam_names)); + if ( !tmp ) + { + hts_log_error("Could not allocate %zu bytes",mplp->nbam_names*sizeof(*mplp->bam_names)); + break; + } + mplp->bam_names = tmp; + mplp->bam_names[mplp->nbam_names-1] = strdup(names[i]); + } + for (i=0; inbam_names++; + char **tmp = (char**)realloc(mplp->bam_names,mplp->nbam_names*sizeof(*mplp->bam_names)); + if ( !tmp ) + { + hts_log_error("Could not allocate %zu bytes",mplp->nbam_names*sizeof(*mplp->bam_names)); + return 1; + } + mplp->bam_names = tmp; + mplp->bam_names[mplp->nbam_names-1] = strdup(bam); + return 0; } diff --git a/mpileup2/mpileup.h b/mpileup2/mpileup.h index 4b73edf83..08edcae7e 100644 --- a/mpileup2/mpileup.h +++ b/mpileup2/mpileup.h @@ -29,13 +29,14 @@ typedef struct _mpileup_t mpileup_t; -// Various options +// Various options. Key marked as 'can fail' shoud be set with mpileup_set() and the +// return status checked as these can fail. typedef enum { // required FASTA_REF, // char*, fasta reference - BAM, // char*, add another BAM/CRAM - BAM_FNAME, // char*, read list of BAMs/CRAMs from a file + BAM, // char*, add another BAM/CRAM. Can fail. + BAM_FNAME, // char*, read list of BAMs/CRAMs from a file. Can fail. // optional REGIONS, // char*, comma-separated regions @@ -57,13 +58,19 @@ typedef enum MIN_BQ, // int, skip bases with baseQ smaller than MIN_BQ [1] MAX_BQ, // int, cap BQ for overly optimistics platforms [60] DELTA_BQ, // int, --delta-BQ value for downgrading high BQ if neighbor_BQ is < BQ - 30 [30] + MIN_REALN_FRAC, // float, minimum fraction of reads with evidence for an indel to consider the site needing realignment [0.05] + MIN_REALN_DP, // int, minimum number of reads with evidence for an indel to consider the site needing realignment [2] + MAX_REALN_DP, // int, subsample to this many reads for realignment [250] + MAX_REALN_LEN, // int, realign reads this long max [500] } mpileup_opt_t; -#define mpileup_set_opt(es,type,key,value) { type tmp = value; mpileup_set(es, key, (void*)&tmp); } +#define mpileup_set_opt(mplp,type,key,value) { type tmp = value; mpileup_set(mplp, key, (void*)&tmp); } +#define mpileup_get_opt(mplp,type,key) (*(type*)mpileup_get(mplp, key)) mpileup_t *mpileup_init(void); -void mpileup_set(mpileup_t *mplp, mpileup_opt_t key, void *value); +int mpileup_set(mpileup_t *mplp, mpileup_opt_t key, void *value); // returns 0 on success +void *mpileup_get(mpileup_t *mplp, mpileup_opt_t key); void mpileup_destroy(mpileup_t *mplp); #endif From 3c53fca4b59b244f8e88418e770bfcc8202b12bd Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Wed, 30 Nov 2022 14:49:19 +0000 Subject: [PATCH 4/6] Backup commit - replaced mpileup_set_opt macro with mpileup_set stdarg function - compiles - prints a simple pileup on the screen --- mpileup2.c | 228 ++++++--------- mpileup2/mpileup.c | 708 ++++++++++++++++++++++++++++++++++++++++++--- mpileup2/mpileup.h | 18 +- 3 files changed, 753 insertions(+), 201 deletions(-) diff --git a/mpileup2.c b/mpileup2.c index 3d3ad00ab..1198877c6 100644 --- a/mpileup2.c +++ b/mpileup2.c @@ -22,20 +22,6 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ - -/* - -Can't decide how much of the functionality should stay in bcftools and what should be moved to htslib and -controlled via mpileup_set_option() - - list BAM files so that cumbersome nplp,plp,nsmpl,files,nfiles variables can be avoided on user side; - check the BAM header consistency (this is currently not done by mpileup) - - list of read groups to include or exclude, possibly with sample renaming, as in mpileup -G - - list of samples to include or exclude, as in mpileup -S - - targets and regions, as in mpileup -R/-T/-r/-t - -*/ - - #include #include #include @@ -75,7 +61,7 @@ typedef struct { int argc; char **argv, *output_fname; - int rflag_skip_any_unset, rflag_skip_all_unset, rflag_skip_any_set, rflag_skip_all_set, flag; + int flag; int output_type, clevel, max_read_len; int record_cmd_line; mpileup_t *mplp; @@ -84,26 +70,14 @@ typedef struct } args_t; - -// Data shared by all bam files -typedef struct { - int min_mq, flag, min_baseQ, max_baseQ, delta_baseQ, capQ_thres, max_depth, - max_indel_depth, max_read_len, fmt_flag, ambig_reads; - int rflag_skip_any_unset, rflag_skip_all_unset, rflag_skip_any_set, rflag_skip_all_set, output_type; - int openQ, extQ, tandemQ, min_support, indel_win_size; // for indels - double indel_bias; - char *reg_fname, *pl_list, *fai_fname, *output_fname; - int reg_is_file, record_cmd_line, n_threads, clevel; - int bed_logic; // 1: include region, 0: exclude region - - // auxiliary structures for calling - kstring_t buf; - bcf1_t *bcf_rec; - htsFile *bcf_fp; - bcf_hdr_t *bcf_hdr; - int argc; - char **argv; -} mplp_conf_t; +static int run_mpileup(args_t *args) +{ + int ret; + while ( (ret=mpileup_next(args->mplp))==1 ) + { + } + return ret; +} #define SET_FMT_FLAG(str,bit,msg) \ if (!strcasecmp(tag,str) || !strcasecmp(tag,"FMT/"str) || !strcasecmp(tag,"FORMAT/"str)) \ @@ -215,10 +189,10 @@ static void list_annotations(FILE *fp) static void print_usage(args_t *args, FILE *fp) { - char *tmp_skip_all_set = bam_flag2str(args->rflag_skip_all_set); - char *tmp_skip_any_unset = bam_flag2str(args->rflag_skip_any_unset); - char *tmp_skip_all_unset = bam_flag2str(args->rflag_skip_all_unset); - char *tmp_skip_any_set = bam_flag2str(args->rflag_skip_any_set); + char *tmp_skip_all_set = bam_flag2str(mpileup_get_opt(args->mplp, int, SKIP_ALL_SET)); + char *tmp_skip_any_unset = bam_flag2str(mpileup_get_opt(args->mplp, int, SKIP_ANY_UNSET)); + char *tmp_skip_all_unset = bam_flag2str(mpileup_get_opt(args->mplp, int, SKIP_ALL_UNSET)); + char *tmp_skip_any_set = bam_flag2str(mpileup_get_opt(args->mplp, int, SKIP_ANY_SET)); fprintf(fp, "\n" @@ -231,7 +205,7 @@ static void print_usage(args_t *args, FILE *fp) " -B, --no-BAQ Disable BAQ (per-Base Alignment Quality)\n" " -C, --adjust-MQ INT Adjust mapping quality [0]\n" " -D, --full-BAQ Apply BAQ everywhere, not just in problematic regions\n" - " -d, --max-depth INT Max raw per-file depth; avoids excessive memory usage [%d]\n", mpileup_get_opt(args->mplp, int, MAX_DP_PER_FILE)); + " -d, --max-depth INT Max raw per-sample depth; avoids excessive memory usage [%d]\n", mpileup_get_opt(args->mplp, int, MAX_DP_PER_SAMPLE)); fprintf(fp, " -E, --redo-BAQ Recalculate BAQ on the fly, ignore existing BQs\n" " -f, --fasta-ref FILE Faidx indexed reference sequence file\n" @@ -317,7 +291,7 @@ static void print_usage(args_t *args, FILE *fp) free(tmp_skip_any_set); } -static void exit_nicely(args_t *args) +static void clean_up(args_t *args) { mpileup_destroy(args->mplp); free(args); @@ -328,23 +302,23 @@ int main_mpileup2(int argc, char *argv[]) int c, ret = 0; args_t *args = (args_t*) calloc(1,sizeof(args_t)); args->argc = argc; args->argv = argv; - args->rflag_skip_any_set = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP; args->flag = MPLP_NO_ORPHAN | MPLP_REALN | MPLP_REALN_PARTIAL | MPLP_SMART_OVERLAPS; args->bca.openQ = 40; args->bca.extQ = 20; args->bca.tandemQ = 500; - args->mplp = mpileup_init(); - mpileup_set_opt(args->mplp, int, MAX_DP_PER_FILE, 250); - mpileup_set_opt(args->mplp, int, MIN_MQ, 0); - mpileup_set_opt(args->mplp, int, MAX_BQ, 60); - mpileup_set_opt(args->mplp, int, DELTA_BQ, 30); - mpileup_set_opt(args->mplp, float, MIN_REALN_FRAC, 0.05); - mpileup_set_opt(args->mplp, int, MIN_REALN_DP, 2); - mpileup_set_opt(args->mplp, int, MAX_REALN_DP, 250); - mpileup_set_opt(args->mplp, int, MAX_REALN_LEN, 500); + args->mplp = mpileup_alloc(); + mpileup_set(args->mplp, MAX_DP_PER_SAMPLE, 250); + mpileup_set(args->mplp, MIN_MQ, 0); + mpileup_set(args->mplp, MAX_BQ, 60); + mpileup_set(args->mplp, DELTA_BQ, 30); + mpileup_set(args->mplp, MIN_REALN_FRAC, 0.05); + mpileup_set(args->mplp, MIN_REALN_DP, 2); + mpileup_set(args->mplp, MAX_REALN_DP, 250); + mpileup_set(args->mplp, MAX_REALN_LEN, 500); + mpileup_set(args->mplp, SKIP_ANY_SET, BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP); - int noref = 0; + hts_srand48(0); #if 0 const char *file_list = NULL; @@ -372,7 +346,6 @@ int main_mpileup2(int argc, char *argv[]) mplp.max_read_len = 500; mplp.indel_win_size = 110; mplp.clevel = -1; - hts_srand48(0); #endif static const struct option lopts[] = @@ -447,57 +420,51 @@ int main_mpileup2(int argc, char *argv[]) }; while ((c = getopt_long(argc, argv, "Ag:f:r:R:q:Q:C:BDd:L:b:P:po:e:h:Im:F:EG:6O:xa:s:S:t:T:M:X:U",lopts,NULL)) >= 0) { switch (c) { - case 'x': mpileup_set_opt(args->mplp, int, SMART_OVERLAPS, 0); break; + case 'x': mpileup_set(args->mplp, SMART_OVERLAPS, 0); break; case 16 : - args->rflag_skip_any_unset = bam_str2flag(optarg); - if ( args->rflag_skip_any_unset <0 ) { fprintf(stderr,"Could not parse --nf %s\n", optarg); return 1; } + { + int flag = bam_str2flag(optarg); + if ( flag<0 ) { fprintf(stderr,"Could not parse --nu %s\n", optarg); return 1; } + mpileup_set(args->mplp, SKIP_ANY_UNSET, flag); break; + } case 17 : - args->rflag_skip_all_unset = bam_str2flag(optarg); - if ( args->rflag_skip_all_unset<0 ) { fprintf(stderr,"Could not parse --if %s\n", optarg); return 1; } + { + int flag = bam_str2flag(optarg); + if ( flag<0 ) { fprintf(stderr,"Could not parse --lu %s\n", optarg); return 1; } + mpileup_set(args->mplp, SKIP_ALL_UNSET, flag); break; + } case 18 : - args->rflag_skip_any_set = bam_str2flag(optarg); - if ( args->rflag_skip_any_set <0 ) { fprintf(stderr,"Could not parse --ef %s\n", optarg); return 1; } + { + int flag = bam_str2flag(optarg); + if ( flag<0 ) { fprintf(stderr,"Could not parse --ns %s\n", optarg); return 1; } + mpileup_set(args->mplp, SKIP_ANY_SET, flag); break; + } case 19 : - args->rflag_skip_all_set = bam_str2flag(optarg); - if ( args->rflag_skip_all_set <0 ) { fprintf(stderr,"Could not parse --df %s\n", optarg); return 1; } + { + int flag = bam_str2flag(optarg); + if ( flag<0 ) { fprintf(stderr,"Could not parse --ls %s\n", optarg); return 1; } + mpileup_set(args->mplp, SKIP_ALL_SET, flag); break; + } case 3 : args->output_fname = optarg; break; case 4 : args->bca.openQ = atoi(optarg); break; - case 5 : mpileup_set_opt(args->mplp, int, SMPL_IGNORE_RG, 1); break; + case 5 : mpileup_set(args->mplp, SMPL_IGNORE_RG, 1); break; case 'g': args->gvcf = gvcf_init(optarg); if ( !args->gvcf ) error("Could not parse: --gvcf %s\n", optarg); break; - case 'f': mpileup_set_opt(args->mplp, char*, FASTA_REF, optarg); break; - case 7 : noref = 1; break; + case 'f': mpileup_set(args->mplp, FASTA_REF, optarg); break; + case 7 : error("todo: --no-reference"); /*noref = 1;*/ break; case 8 : args->record_cmd_line = 0; break; case 9 : error("todo: --threads\n"); /* note this was used *only* for output VCF compression, practically useless, moreover -Ou is recommended... */ break; - case 'd': mpileup_set_opt(args->mplp, int, MAX_DP_PER_FILE, atoi(optarg)); break; - case 'r': mpileup_set_opt(args->mplp, char*, REGIONS, optarg); break; - case 'R': mpileup_set_opt(args->mplp, char*, REGIONS_FNAME, optarg); break; - case 't': mpileup_set_opt(args->mplp, char*, TARGETS, optarg); - // // In the original version the whole BAM was streamed which is inefficient - // // with few BED intervals and big BAMs. Todo: devise a heuristic to determine - // // best strategy, that is streaming or jumping. - // if ( optarg[0]=='^' ) optarg++; - // else mplp.bed_logic = 1; - // mplp.bed = regidx_init(NULL,regidx_parse_reg,NULL,0,NULL); - // mplp.bed_itr = regitr_init(mplp.bed); - // if ( regidx_insert_list(mplp.bed,optarg,',') !=0 ) - // { - // fprintf(stderr,"Could not parse the targets: %s\n", optarg); - // exit(EXIT_FAILURE); - // } - break; - case 'T': mpileup_set_opt(args->mplp, char*, TARGETS_FNAME, optarg); - // if ( optarg[0]=='^' ) optarg++; - // else mplp.bed_logic = 1; - // mplp.bed = regidx_init(optarg,NULL,NULL,0,NULL); - // if (!mplp.bed) { fprintf(stderr, "bcftools mpileup: Could not read file \"%s\"", optarg); return 1; } - break; + case 'd': mpileup_set(args->mplp, MAX_DP_PER_SAMPLE, atoi(optarg)); break; + case 'r': mpileup_set(args->mplp, REGIONS, optarg); break; + case 'R': mpileup_set(args->mplp, REGIONS_FNAME, optarg); break; + case 't': mpileup_set(args->mplp, TARGETS, optarg); break; + case 'T': mpileup_set(args->mplp, TARGETS_FNAME, optarg); break; case 'P': error("todo: --platforms\n"); /* this was never used: mplp.pl_list = strdup(optarg); */ break; case 'p': args->flag |= MPLP_PER_SAMPLE; break; case 'B': args->flag &= ~MPLP_REALN; break; @@ -505,8 +472,8 @@ int main_mpileup2(int argc, char *argv[]) case 'I': args->flag |= MPLP_NO_INDEL; break; case 'E': args->flag |= MPLP_REDO_BAQ; break; case '6': args->flag |= MPLP_ILLUMINA13; break; - case 's': mpileup_set_opt(args->mplp, char*, SAMPLES, optarg); break; - case 'S': mpileup_set_opt(args->mplp, char*, SAMPLES_FNAME, optarg); break; + case 's': mpileup_set(args->mplp, SAMPLES, optarg); break; + case 'S': mpileup_set(args->mplp, SAMPLES_FNAME, optarg); break; case 'O': switch (optarg[0]) { case 'b': args->output_type = FT_BCF_GZ; break; @@ -527,12 +494,12 @@ int main_mpileup2(int argc, char *argv[]) if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --output-type %s\n", optarg+1); } break; - case 'C': mpileup_set_opt(args->mplp, int, ADJUST_MQ, atoi(optarg)); break; - case 'q': mpileup_set_opt(args->mplp, int, MIN_MQ, atoi(optarg)); break; - case 'Q': mpileup_set_opt(args->mplp, int, MIN_BQ, atoi(optarg)); break; - case 11: mpileup_set_opt(args->mplp, int, MAX_BQ, atoi(optarg)); break; - case 12: mpileup_set_opt(args->mplp, int, DELTA_BQ, atoi(optarg)); break; - case 'b': if ( mpileup_set(args->mplp, BAM_FNAME, &optarg)!=0 ) exit_nicely(args); break; + case 'C': mpileup_set(args->mplp, ADJUST_MQ, atoi(optarg)); break; + case 'q': mpileup_set(args->mplp, MIN_MQ, atoi(optarg)); break; + case 'Q': mpileup_set(args->mplp, MIN_BQ, atoi(optarg)); break; + case 11: mpileup_set(args->mplp, MAX_BQ, atoi(optarg)); break; + case 12: mpileup_set(args->mplp, DELTA_BQ, atoi(optarg)); break; + case 'b': if ( mpileup_set(args->mplp, BAM_FNAME, optarg)!=0 ) clean_up(args); break; case 'o': /* this option used to be -o INT and -o FILE. In the new code disable -o as an alias of --open-prob as its use is very rare */ args->output_fname = optarg; @@ -557,10 +524,10 @@ int main_mpileup2(int argc, char *argv[]) } break; case 'A': args->flag &= ~MPLP_NO_ORPHAN; break; - case 'F': mpileup_set_opt(args->mplp, float, MIN_REALN_FRAC, atof(optarg)); break; - case 'm': mpileup_set_opt(args->mplp, int, MIN_REALN_DP, atoi(optarg)); break; - case 'L': mpileup_set_opt(args->mplp, int, MAX_REALN_DP, atoi(optarg)); break; - case 'G': mpileup_set_opt(args->mplp, char*, READ_GROUPS_FNAME, optarg); break; + case 'F': mpileup_set(args->mplp, MIN_REALN_FRAC, atof(optarg)); break; + case 'm': mpileup_set(args->mplp, MIN_REALN_DP, atoi(optarg)); break; + case 'L': mpileup_set(args->mplp, MAX_REALN_DP, atoi(optarg)); break; + case 'G': mpileup_set(args->mplp, READ_GROUPS_FNAME, optarg); break; case 'a': if (optarg[0]=='?') { list_annotations(stderr); @@ -571,26 +538,26 @@ int main_mpileup2(int argc, char *argv[]) case 'M': args->max_read_len = atoi(optarg); break; case 'X': if (strcasecmp(optarg, "pacbio-ccs") == 0) { - mpileup_set_opt(args->mplp, float, MIN_REALN_FRAC, 0.1); - mpileup_set_opt(args->mplp, int, MIN_BQ, 5); - mpileup_set_opt(args->mplp, int, MAX_BQ, 50); - mpileup_set_opt(args->mplp, int, DELTA_BQ, 10); + mpileup_set(args->mplp, MIN_REALN_FRAC, 0.1); + mpileup_set(args->mplp, MIN_BQ, 5); + mpileup_set(args->mplp, MAX_BQ, 50); + mpileup_set(args->mplp, DELTA_BQ, 10); args->bca.openQ = 25; args->bca.extQ = 1; args->flag |= MPLP_REALN_PARTIAL; - mpileup_set_opt(args->mplp, int, MAX_REALN_LEN, 99999); + mpileup_set(args->mplp, MAX_REALN_LEN, 99999); } else if (strcasecmp(optarg, "ont") == 0) { fprintf(stderr, "For ONT it may be beneficial to also run bcftools call with " "a higher -P, eg -P0.01 or -P 0.1\n"); - mpileup_set_opt(args->mplp, int, MIN_BQ, 5); - mpileup_set_opt(args->mplp, int, MAX_BQ, 30); + mpileup_set(args->mplp, MIN_BQ, 5); + mpileup_set(args->mplp, MAX_BQ, 30); args->flag &= ~MPLP_REALN; args->flag |= MPLP_NO_INDEL; } else if (strcasecmp(optarg, "1.12") == 0) { // 1.12 and earlier - mpileup_set_opt(args->mplp, float, MIN_REALN_FRAC, 0.002); + mpileup_set(args->mplp, MIN_REALN_FRAC, 0.002); args->bca.min_support = 1; - mpileup_set_opt(args->mplp, int, MIN_BQ, 13); + mpileup_set(args->mplp, MIN_BQ, 13); args->bca.tandemQ = 100; args->flag &= ~MPLP_REALN_PARTIAL; args->flag |= MPLP_REALN; @@ -621,49 +588,16 @@ int main_mpileup2(int argc, char *argv[]) if ( argc==1 ) { print_usage(args, stderr); - exit_nicely(args); + clean_up(args); return 1; } int i; for (i=0; implp, BAM, &argv[optind+i])!=0 ) exit_nicely(args); - - exit_nicely(args); - -#if 0 - if ( !(mplp.flag&MPLP_REALN) && mplp.flag&MPLP_REDO_BAQ ) - { - fprintf(stderr,"Error: The -B option cannot be combined with -E\n"); - return 1; - } - if (!mplp.fai && !noref) { - fprintf(stderr,"Error: mpileup requires the --fasta-ref option by default; use --no-reference to run without a fasta reference\n"); - return 1; - } - int ret,i; - if (file_list) - { - if ( read_file_list(file_list,&nfiles,&fn) ) return 1; - mplp.files = fn; - mplp.nfiles = nfiles; - } - else - { - mplp.nfiles = argc - optind; - mplp.files = (char**) malloc(mplp.nfiles*sizeof(char*)); - for (i=0; implp, BAM, argv[optind+i])!=0 ) clean_up(args); + mpileup_init(args->mplp); - for (i=0; i #include #include #include #include #include +#include #include #include "mpileup.h" +#include "bam_sample.h" -static int mplp_add_bam(mpileup_t *mplp, char *bam, int is_file); +typedef struct +{ + int icig; // position in the cigar string + int iseq; // the cigar[icig] operation refers to seq[iseq] + hts_pos_t ref_pos; // ref coordinate of the first base (if the current icig op consumes ref) +} +cigar_state_t; + +typedef struct +{ + bam1_t *bam; + cigar_state_t cstate; + uint32_t + next:31, // index of the next node, rbuf->tail's next points to itself + is_set:1; // if is_set==0 && next==0, the node is unitialized in a newly allocated block +} +rb_node_t; + +typedef struct +{ + int tid; + hts_pos_t pos; // current position + int len; // keep reads overlapping pos+len + rb_node_t *buf; // the reads, unordered and with gaps, linked through 'next' indexes + int n, m; // the number of buffer elements used and allocated + int head, tail; // the index of the first and last used record + int empty; // the index of the first unused record + int nins, ndel, nbase; +} +read_buffer_t; + +typedef struct +{ + samFile *fp; + bam_hdr_t *hdr; + hts_itr_t *iter; + char *fname; + int bam_id; + bam1_t *cached_rec; + int cached_smpl; + int32_t tid; + hts_pos_t pos; +} +bam_aux_t; -struct _mpileup_t +struct mpileup_t_ { char *fasta_ref, *regions, *regions_fname, *targets, *targets_fname, *samples, *samples_fname, *read_groups_fname; - int smart_overlaps, smpl_ignore_rg, max_dp_per_file, adjust_mq, min_mq, min_bq, max_bq, delta_bq; - int min_realn_dp, max_realn_dp, max_realn_len; + int smart_overlaps, smpl_ignore_rg, max_dp_per_sample, adjust_mq, min_mq, min_bq, max_bq, delta_bq; + int min_realn_dp, max_realn_dp, max_realn_len, skip_any_unset, skip_all_unset, skip_any_set, skip_all_set; float min_realn_frac; + int nbam, nbam_names; char **bam_names; - int nbam_names; + bam_aux_t *bam; + bam_hdr_t *hdr; + bam_smpl_t *bsmpl; + read_buffer_t *read_buf; // read buffer for each sample, nsmpl elements + int nsmpl; + int32_t tid; + hts_pos_t pos; + bam_pileup1_t **plp; + int *nplp; }; -mpileup_t *mpileup_init(void) +static int mplp_add_bam(mpileup_t *mplp, char *bam, int is_file); +static void read_buffer_clear(read_buffer_t *rbuf); + +mpileup_t *mpileup_alloc(void) { mpileup_t *mplp = (mpileup_t*) calloc(1,sizeof(mpileup_t)); - mpileup_set_opt(mplp,int,MAX_DP_PER_FILE,250); - mpileup_set_opt(mplp,int,MIN_BQ,1); - mpileup_set_opt(mplp,int,MAX_BQ,60); - mpileup_set_opt(mplp,int,DELTA_BQ,30); - mpileup_set_opt(mplp,float,MIN_REALN_FRAC,0.05); - mpileup_set_opt(mplp,int,MIN_REALN_DP,2); - mpileup_set_opt(mplp,int,MAX_REALN_DP,250); - mpileup_set_opt(mplp,int,MAX_REALN_LEN,500); + mpileup_set(mplp,MAX_DP_PER_SAMPLE,250); + mpileup_set(mplp,MIN_BQ,1); + mpileup_set(mplp,MAX_BQ,60); + mpileup_set(mplp,DELTA_BQ,30); + mpileup_set(mplp,MIN_REALN_FRAC,0.05); + mpileup_set(mplp,MIN_REALN_DP,2); + mpileup_set(mplp,MAX_REALN_DP,250); + mpileup_set(mplp,MAX_REALN_LEN,500); + mpileup_set(mplp,SKIP_ANY_SET,BAM_FUNMAP|BAM_FSECONDARY|BAM_FQCFAIL|BAM_FDUP); + mplp->pos = -1; + mplp->tid = -1; return mplp; } -int mpileup_set(mpileup_t *mplp, mpileup_opt_t key, void *value) +int mpileup_set(mpileup_t *mplp, mpileup_opt_t key, ...) { + va_list args; switch (key) { - case FASTA_REF: free(mplp->fasta_ref); mplp->fasta_ref = strdup(*((char**)value)); break; - case BAM: return mplp_add_bam(mplp,*((char**)value),0); break; - case BAM_FNAME: return mplp_add_bam(mplp,*((char**)value),1); break; - case REGIONS: free(mplp->regions); mplp->regions = strdup(*((char**)value)); break; - case REGIONS_FNAME: free(mplp->regions_fname); mplp->regions_fname = strdup(*((char**)value)); break; - case TARGETS: free(mplp->targets); mplp->targets = strdup(*((char**)value)); break; - case TARGETS_FNAME: free(mplp->targets_fname); mplp->targets_fname = strdup(*((char**)value)); break; - case SAMPLES: free(mplp->samples); mplp->samples = strdup(*((char**)value)); break; - case SAMPLES_FNAME: free(mplp->samples_fname); mplp->samples_fname = strdup(*((char**)value)); break; - case READ_GROUPS_FNAME: free(mplp->read_groups_fname); mplp->read_groups_fname = strdup(*((char**)value)); break; - case SMART_OVERLAPS: mplp->smart_overlaps = *((int*)value); break; - case SMPL_IGNORE_RG: mplp->smpl_ignore_rg = *((int*)value); break; - case MAX_DP_PER_FILE: mplp->max_dp_per_file = *((int*)value); break; - case ADJUST_MQ: mplp->adjust_mq = *((int*)value); break; - case MIN_MQ: mplp->min_mq = *((int*)value); break; - case MIN_BQ: mplp->min_bq = *((int*)value); break; - case MAX_BQ: mplp->max_bq = *((int*)value); break; - case DELTA_BQ: mplp->delta_bq = *((int*)value); break; - case MIN_REALN_FRAC: mplp->min_realn_frac = *((float*)value); break; - case MIN_REALN_DP: mplp->min_realn_dp = *((int*)value); break; - case MAX_REALN_DP: mplp->max_realn_dp = *((int*)value); break; - case MAX_REALN_LEN: mplp->max_realn_len = *((int*)value); break; - default: hts_log_error("Todo: mpileup_set key=%d",(int)key); return 1; break; + case FASTA_REF: + free(mplp->fasta_ref); + va_start(args, key); + mplp->fasta_ref = strdup(va_arg(args,char*)); + va_end(args); + return mplp->fasta_ref ? 0 : -1; + + case BAM: + { + va_start(args, key); + char *bam = va_arg(args,char*); + va_end(args); + return mplp_add_bam(mplp,bam,0); + } + + case BAM_FNAME: + { + va_start(args, key); + char *bam = va_arg(args,char*); + va_end(args); + return mplp_add_bam(mplp,bam,1); + } + + case REGIONS: + free(mplp->regions); + va_start(args, key); + mplp->regions = strdup(va_arg(args,char*)); + va_end(args); + return mplp->regions ? 0 : -1; + + case REGIONS_FNAME: + free(mplp->regions_fname); + va_start(args, key); + mplp->regions_fname = strdup(va_arg(args,char*)); + va_end(args); + return mplp->regions_fname ? 0 : -1; + + case TARGETS: + free(mplp->targets); + va_start(args, key); + mplp->targets = strdup(va_arg(args,char*)); + va_end(args); + return mplp->targets ? 0 : -1; + + case TARGETS_FNAME: + free(mplp->targets_fname); + va_start(args, key); + mplp->targets_fname = strdup(va_arg(args,char*)); + va_end(args); + return mplp->targets_fname ? 0 : -1; + + case SAMPLES: + free(mplp->samples); + va_start(args, key); + mplp->samples = strdup(va_arg(args,char*)); + va_end(args); + return mplp->samples ? 0 : -1; + + case SAMPLES_FNAME: + free(mplp->samples_fname); + va_start(args, key); + mplp->samples_fname = strdup(va_arg(args,char*)); + va_end(args); + return mplp->samples_fname ? 0 : -1; + + case READ_GROUPS_FNAME: + free(mplp->read_groups_fname); + va_start(args, key); + mplp->read_groups_fname = strdup(va_arg(args,char*)); + va_end(args); + return mplp->read_groups_fname ? 0 : -1; + + case SMART_OVERLAPS: + va_start(args, key); + mplp->smart_overlaps = va_arg(args,int); + va_end(args); + return 0; + + case SMPL_IGNORE_RG: + va_start(args, key); + mplp->smpl_ignore_rg = va_arg(args,int); + va_end(args); + return 0; + + case SKIP_ANY_UNSET: + va_start(args, key); + mplp->skip_any_unset = va_arg(args,int); + va_end(args); + return 0; + + case SKIP_ALL_UNSET: + va_start(args, key); + mplp->skip_all_unset = va_arg(args,int); + va_end(args); + return 0; + + case SKIP_ANY_SET: + va_start(args, key); + mplp->skip_any_set = va_arg(args,int); + va_end(args); + return 0; + + case SKIP_ALL_SET: + va_start(args, key); + mplp->skip_all_set = va_arg(args,int); + va_end(args); + return 0; + + case MAX_DP_PER_SAMPLE: + va_start(args, key); + mplp->max_dp_per_sample = va_arg(args,int); + va_end(args); + return 0; + + case ADJUST_MQ: + va_start(args, key); + mplp->adjust_mq = va_arg(args,int); + va_end(args); + return 0; + + case MIN_MQ: + va_start(args, key); + mplp->min_mq = va_arg(args,int); + va_end(args); + return 0; + + case MIN_BQ: + va_start(args, key); + mplp->min_bq = va_arg(args,int); + va_end(args); + return 0; + + case MAX_BQ: + va_start(args, key); + mplp->max_bq = va_arg(args,int); + va_end(args); + return 0; + + case DELTA_BQ: + va_start(args, key); + mplp->delta_bq = va_arg(args,int); + va_end(args); + return 0; + + case MIN_REALN_FRAC: + va_start(args, key); + mplp->min_realn_frac = va_arg(args,double); + va_end(args); + return 0; + + case MIN_REALN_DP: + va_start(args, key); + mplp->min_realn_dp = va_arg(args,int); + va_end(args); + return 0; + + case MAX_REALN_DP: + va_start(args, key); + mplp->max_realn_dp = va_arg(args,int); + va_end(args); + return 0; + + case MAX_REALN_LEN: + va_start(args, key); + mplp->max_realn_len = va_arg(args,int); + va_end(args); + return 0; + + default: + hts_log_error("Todo: mpileup_set key=%d",(int)key); + return 1; + break; } return 0; } @@ -95,7 +306,7 @@ void *mpileup_get(mpileup_t *mplp, mpileup_opt_t key) { case SMART_OVERLAPS: return &mplp->smart_overlaps; break; case SMPL_IGNORE_RG: return &mplp->smpl_ignore_rg; break; - case MAX_DP_PER_FILE: return &mplp->max_dp_per_file; break; + case MAX_DP_PER_SAMPLE: return &mplp->max_dp_per_sample; break; case ADJUST_MQ: return &mplp->adjust_mq; break; case MIN_MQ: return &mplp->min_mq; break; case MIN_BQ: return &mplp->min_bq; break; @@ -105,6 +316,10 @@ void *mpileup_get(mpileup_t *mplp, mpileup_opt_t key) case MIN_REALN_DP: return &mplp->min_realn_dp; break; case MAX_REALN_DP: return &mplp->max_realn_dp; break; case MAX_REALN_LEN: return &mplp->max_realn_len; break; + case SKIP_ANY_UNSET: return &mplp->skip_any_unset; break; + case SKIP_ALL_UNSET: return &mplp->skip_all_unset; break; + case SKIP_ANY_SET: return &mplp->skip_any_set; break; + case SKIP_ALL_SET: return &mplp->skip_all_set; break; default: hts_log_error("Todo: mpileup_get key=%d",(int)key); return NULL; break; } return NULL; @@ -113,8 +328,20 @@ void *mpileup_get(mpileup_t *mplp, mpileup_opt_t key) void mpileup_destroy(mpileup_t *mplp) { int i; + for (i=0; inbam; i++) + { + sam_close(mplp->bam[i].fp); + if ( mplp->bam[i].cached_rec ) bam_destroy1(mplp->bam[i].cached_rec); + } for (i=0; inbam_names; i++) free(mplp->bam_names[i]); + free(mplp->bam); free(mplp->bam_names); + for (i=0; insmpl; i++) read_buffer_clear(&mplp->read_buf[i]); + free(mplp->read_buf); + free(mplp->plp); + free(mplp->nplp); + bam_hdr_destroy(mplp->hdr); + if ( mplp->bsmpl ) bam_smpl_destroy(mplp->bsmpl); free(mplp->fasta_ref); free(mplp->regions); free(mplp->regions_fname); @@ -156,29 +383,416 @@ static int mplp_add_bam(mpileup_t *mplp, char *bam, int is_file) hts_log_error("Does the file \"%s\" really contain a list of files and do all exist?", bam); break; } - mplp->nbam_names++; - char **tmp = (char**)realloc(mplp->bam_names,mplp->nbam_names*sizeof(*mplp->bam_names)); + char **tmp = (char**)realloc(mplp->bam_names,(mplp->nbam_names+1)*sizeof(*mplp->bam_names)); if ( !tmp ) { - hts_log_error("Could not allocate %zu bytes",mplp->nbam_names*sizeof(*mplp->bam_names)); + hts_log_error("Could not allocate %zu bytes",(mplp->nbam_names+1)*sizeof(*mplp->bam_names)); break; } mplp->bam_names = tmp; - mplp->bam_names[mplp->nbam_names-1] = strdup(names[i]); + mplp->bam_names[mplp->nbam_names++] = strdup(names[i]); } for (i=0; inbam_names++; - char **tmp = (char**)realloc(mplp->bam_names,mplp->nbam_names*sizeof(*mplp->bam_names)); + char **tmp = (char**)realloc(mplp->bam_names,(mplp->nbam_names+1)*sizeof(*mplp->bam_names)); if ( !tmp ) { - hts_log_error("Could not allocate %zu bytes",mplp->nbam_names*sizeof(*mplp->bam_names)); + hts_log_error("Could not allocate %zu bytes",(mplp->nbam_names+1)*sizeof(*mplp->bam_names)); return 1; } mplp->bam_names = tmp; - mplp->bam_names[mplp->nbam_names-1] = strdup(bam); + mplp->bam_names[mplp->nbam_names++] = strdup(bam); + return 0; +} + +static int mplp_check_header_contigs(mpileup_t *mplp, bam_hdr_t *hdr) +{ + static int warned = 0; + if ( !warned ) hts_log_error("todo: mplp_check_header_contigs"); + warned = 1; + return 0; +} +int mpileup_init(mpileup_t *mplp) +{ + mplp->bsmpl = bam_smpl_init(); + if ( mplp->smpl_ignore_rg ) bam_smpl_ignore_readgroups(mplp->bsmpl); + if ( mplp->read_groups_fname ) bam_smpl_add_readgroups(mplp->bsmpl, mplp->read_groups_fname, 1); + if ( mplp->samples && !bam_smpl_add_samples(mplp->bsmpl,mplp->samples,0) ) + { + hts_log_error("Could not parse samples: %s", mplp->samples); + return 1; + } + if ( mplp->samples_fname && !bam_smpl_add_samples(mplp->bsmpl,mplp->samples_fname,0) ) + { + hts_log_error("Could not read samples: %s", mplp->samples_fname); + return 1; + } + + mplp->bam = (bam_aux_t*)calloc(mplp->nbam_names,sizeof(*mplp->bam)); + if ( !mplp->bam ) + { + hts_log_error("Could not allocate %zu bytes",mplp->nbam_names*sizeof(*mplp->bam)); + return 1; + } + int i; + for (i=0; inbam_names; i++) + { + char *fname = mplp->bam_names[i]; + bam_aux_t *baux = &mplp->bam[mplp->nbam]; + baux->cached_smpl = -1; + baux->fname = fname; + baux->fp = sam_open(fname,"rb"); + if ( !baux->fp ) + { + hts_log_error("Failed to open %s: %s",fname,strerror(errno)); + return 1; + } + if ( hts_set_opt(baux->fp, CRAM_OPT_DECODE_MD, 1) ) + { + hts_log_error("Failed to set CRAM_OPT_DECODE_MD value: %s\n",fname); + return 1; + } + if ( mplp->fasta_ref && hts_set_fai_filename(baux->fp,mplp->fasta_ref)!=0 ) + { + hts_log_error("Failed to process %s: %s\n",mplp->fasta_ref, strerror(errno)); + return 1; + } + // baux->conf = conf; + // baux->ref = &mp_ref; + bam_hdr_t *hdr = sam_hdr_read(baux->fp); + if ( !hdr ) + { + hts_log_error("Failed to read the header of %s\n",fname); + return 1; + } + baux->bam_id = bam_smpl_add_bam(mplp->bsmpl,hdr->text,fname); + if ( baux->bam_id < 0 ) + { + // no usable read groups in this bam, can be skipped + sam_close(baux->fp); + bam_hdr_destroy(hdr); + } + mplp->nbam++; + if ( !mplp->hdr ) mplp->hdr = hdr; + else + { + int ret = mplp_check_header_contigs(mplp,hdr); + bam_hdr_destroy(hdr); + if ( ret!=0 ) return 1; + } + baux->hdr = mplp->hdr; + baux->tid = -1; + } + bam_smpl_get_samples(mplp->bsmpl,&mplp->nsmpl); + if ( !mplp->nsmpl ) + { + hts_log_error("Failed to initialize samples"); + return 1; + } + mplp->read_buf = (read_buffer_t*) calloc(mplp->nsmpl,sizeof(*mplp->read_buf)); + mplp->plp = (bam_pileup1_t**) calloc(mplp->nsmpl,sizeof(*mplp->plp)); + mplp->nplp = (int*) calloc(mplp->nsmpl,sizeof(*mplp->nplp)); + + return 0; +} + +void read_buffer_clear(read_buffer_t *rbuf) +{ + int i; + for (i=0; im; i++) + { + if ( rbuf->buf[i].bam ) bam_destroy1(rbuf->buf[i].bam); + } + free(rbuf->buf); + memset(rbuf,0,sizeof(*rbuf)); +} +/* + * Returns + * 0 .. on success (pushed onto the buffer, swapped rec for NULL or an empty record) + * 1 .. cannot push (different tid or not ready for the current position yet) + * 2 .. cannot push (past the current position) + * -1 .. error + */ +int read_buffer_push(read_buffer_t *rbuf, hts_pos_t pos, bam1_t **rec_ptr) +{ +static int warned = 0; +if ( !warned ) hts_log_error("read_buffer_push - left softclip vs read start"); +warned = 1; + + bam1_t *rec = *rec_ptr; + if ( rec->core.tid != rbuf->tid ) return 1; + if ( rbuf->len < rec->core.l_qseq ) rbuf->len = rec->core.l_qseq; + if ( rec->core.pos + rbuf->len < pos ) return 2; + if ( rec->core.pos > pos + rbuf->len ) return 2; +//fprintf(stderr,"pushing %d: %d+%d in %d-%d\n",rbuf->n,(int)rec->core.pos,(int)rec->core.l_qseq,(int)pos,rbuf->len); + if ( rbuf->n==rbuf->m ) + { + // Need to increase the buffer size + if ( hts_resize(rb_node_t, rbuf->n + 1, &rbuf->m, &rbuf->buf, HTS_RESIZE_CLEAR)<0 ) + { + hts_log_error("Failed to allocate memory"); + return -1; + } + rbuf->empty = rbuf->n; // the first newly allocated block is the new empty + } + // claim an empty node and populate it + rb_node_t *node = &rbuf->buf[rbuf->empty]; + *rec_ptr = node->bam; + node->bam = rec; + node->cstate.ref_pos = rec->core.pos; + node->next = rbuf->empty; // the tail's next points to itself + if ( rbuf->n ) + rbuf->buf[rbuf->tail].next = rbuf->empty; + else + rbuf->head = rbuf->empty; + rbuf->tail = rbuf->empty; + + // Assign the head of empty nodes. Note that the latter can go beyond the allocated buffer or nonsense value, + // but that will be set above to a correct value later when the buffer size is increased + rbuf->empty = node->is_set ? node->next : rbuf->empty + 1; + node->is_set = 1; + rbuf->n++; + + if ( pos==0 && rbuf->n==1 ) + { + rbuf->tid = rbuf->buf[rbuf->head].bam->core.tid; + rbuf->pos = rbuf->buf[rbuf->head].bam->core.pos; + } + return 0; } +/* + * Read from a bam and fill a read buffer (separate for each sample) to allow read sub-sampling. + * + * Returns 0 on success (buffers full), 1 when done (i.e. no new reads), or a negative value on error + */ +static int mplp_read_bam(mpileup_t *mplp, bam_aux_t *baux) +{ + // if new mplp position is requested (different tid or mplp pos outside of the last region) + // update the sam iterator baux->iter = sam_itr_querys(conf->mplp_data[i]->idx, conf->mplp_data[i]->h, conf->buf.s); + + if ( baux->cached_smpl >= 0 ) + { + read_buffer_t *rbuf = &mplp->read_buf[baux->cached_smpl]; + int ret = read_buffer_push(rbuf, mplp->pos, &baux->cached_rec); + if ( ret<0 ) return ret; // an error (malloc) + if ( ret==1 ) return 0; // cannot push, full buffer, not ready for the current position + baux->cached_smpl = -1; + } + while (1) + { + if ( !baux->cached_rec ) baux->cached_rec = bam_init1(); + int ret = baux->iter ? sam_itr_next(baux->fp, baux->iter, baux->cached_rec) : sam_read1(baux->fp, baux->hdr, baux->cached_rec); + if ( ret<0 ) + { + if ( ret==-1 ) return 1; // no more data + return -1; // error + } + + // Check if the records have tids in ascending order. This is not required by the SAM specification but + // mpileup2 relies on it (for now) + bam1_t *rec = baux->cached_rec; + if ( baux->tid==-1 ) baux->tid = rec->core.tid, baux->pos = rec->core.pos; + else if ( baux->tid > rec->core.tid ) + { + hts_log_error("Failed assumption, sequence tids out of order in %s",baux->fname); + return -1; + } + else if ( baux->pos > rec->core.pos ) + { + hts_log_error("Unsorted alignments in %s",baux->fname); + return -1; + } + baux->pos = rec->core.pos; + + // exclude unmapped reads and any filters provided by the user + if ( rec->core.tid < 0 || (rec->core.flag&BAM_FUNMAP) ) continue; + if ( mplp->skip_any_unset && (mplp->skip_any_unset & rec->core.flag)!=mplp->skip_any_unset ) continue; + if ( mplp->skip_all_set && (mplp->skip_all_set & rec->core.flag)==mplp->skip_all_set ) continue; + if ( mplp->skip_all_unset && !(mplp->skip_all_unset & rec->core.flag) ) continue; + if ( mplp->skip_any_set && (mplp->skip_any_set & rec->core.flag) ) continue; + + // exclude if sample is not wanted + baux->cached_smpl = bam_smpl_get_sample_id(mplp->bsmpl, baux->bam_id, rec); + if ( baux->cached_smpl<0 ) continue; + + read_buffer_t *rbuf = &mplp->read_buf[baux->cached_smpl]; + ret = read_buffer_push(rbuf, mplp->pos, &baux->cached_rec); + if ( ret==1 ) return 0; // cannot push, full buffer, not ready for the current position + if ( ret<0 ) return ret; // an error (malloc) + baux->cached_smpl = -1; + } + return -1; // can never happen +} + +/* + * Returns + * BAM_CMATCH when read overlaps pos and has a base match or mismatch. Sets ipos + * BAM_CINS when the next base is an insertion. Sets ipos to the current base + * BAM_CDEL when the next or overlapping base is a deletion. Sets ipos to the current + * base when the next base is a deletion and to -1 when inside the deletion + * -1 when there is no base to show (e.g. soft clip) + * -2 when the read ends before pos + */ +static int cigar_get_pos(cigar_state_t *cs, bam1_t *bam, hts_pos_t pos, int32_t *ipos) +{ + int ncig = bam->core.n_cigar; + uint32_t *cigar = bam_get_cigar(bam); + while ( cs->ref_pos <= pos ) + { + if ( cs->icig >= ncig ) return -2; // the read ends before pos + int op = cigar[cs->icig] & BAM_CIGAR_MASK; + int len = cigar[cs->icig] >> BAM_CIGAR_SHIFT; + if ( op==BAM_CMATCH || op==BAM_CEQUAL || op==BAM_CDIFF ) + { + hts_pos_t end_pos = cs->ref_pos + len - 1; + if ( end_pos < pos ) // pos not covered by this cigar block + { + cs->ref_pos += len; + cs->iseq += len; + cs->icig++; + continue; + } + *ipos = pos - cs->ref_pos + cs->iseq; + + // if the following base is an insertion, return BAM_CINS + if ( end_pos==pos && cs->icig+1 < ncig ) + { + int next_op = cigar[cs->icig+1] & BAM_CIGAR_MASK; + if ( next_op==BAM_CINS ) return BAM_CINS; + if ( next_op==BAM_CDEL ) return BAM_CDEL; + } + return BAM_CMATCH; + } + if ( op==BAM_CINS || op==BAM_CSOFT_CLIP ) + { + cs->iseq += len; + cs->icig++; + continue; + } + if ( op==BAM_CDEL ) + { + hts_pos_t end_pos = cs->ref_pos + len - 1; + if ( end_pos < pos ) // pos not covered by this cigar block + { + cs->ref_pos += len; + cs->icig++; + continue; + } + *ipos = -1; + return BAM_CDEL; + } + if ( op==BAM_CREF_SKIP ) + { + hts_pos_t end_pos = cs->ref_pos + len - 1; + if ( end_pos < pos ) // pos not covered by this cigar block + { + cs->ref_pos += len; + cs->icig++; + continue; + } + return -1; + } + hts_log_error("todo: CIGAR operator %d", op); + assert(0); + } + return -1; +} + +static int mplp_set_pileup(mpileup_t *mplp, read_buffer_t *rbuf) +{ + if ( !rbuf->n ) return 0; + rbuf->ndel = rbuf->nins = rbuf->nbase = 0; + int iprev = -1, i = rbuf->head; + while ( 1 ) + { + rb_node_t *node = &rbuf->buf[i]; + int32_t ipos; + int ret = cigar_get_pos(&node->cstate, node->bam, mplp->pos, &ipos); + if ( ret==BAM_CMATCH ) // there is a base, match or mismatch + { + uint8_t *seq = bam_get_seq(node->bam); + fprintf(stderr," %c","ACGTN"[seq_nt16_int[bam_seqi(seq,ipos)]]); + rbuf->nbase++; + } + else if ( ret==BAM_CINS ) rbuf->nins++; + else if ( ret==BAM_CDEL && ipos!=-1 ) rbuf->ndel++; + else if ( ret==-2 ) // this read is done, remove the node + { + rbuf->n--; + node->is_set = 0; + int next = node->next; + node->next = rbuf->empty; + rbuf->empty = i; + if ( next==i ) // node that points to itself is the last node in the chain + { + assert( rbuf->tail==i ); + rbuf->tail = iprev; + if ( iprev==-1 ) rbuf->head = iprev; + else rbuf->buf[iprev].next = iprev; + break; + } + else if ( iprev==-1 ) // removing the first node + rbuf->head = next; + else // removing a middle node + rbuf->buf[iprev].next = next; + i = next; + continue; + } + + if ( rbuf->buf[i].next == i ) break; + iprev = i; + i = rbuf->buf[i].next; + } + fprintf(stderr,"\n"); + return 0; +} + +int mpileup_next(mpileup_t *mplp) +{ + int i; + + // if new region is needed + // - advance regitr_loop + // - reset read_buffers + // - reset mplp tid and pos + + // fill buffers + mplp->pos++; + for (i=0; inbam; i++) + { + int ret = mplp_read_bam(mplp, &mplp->bam[i]); + if ( ret<0 ) return ret; // an error occurred + } + + // this would be a good place for realignment + + // find the minimum position if first time here + if ( mplp->tid==-1 ) + { + mplp->tid = INT32_MAX; + for (i=0; insmpl; i++) + { + if ( mplp->tid > mplp->read_buf[i].tid ) mplp->tid = mplp->read_buf[i].tid, mplp->pos = HTS_POS_MAX; + if ( mplp->pos > mplp->read_buf[i].pos ) mplp->pos = mplp->read_buf[i].pos; + } + } + +fprintf(stderr,"tid,pos=%d %d\n",mplp->tid,(int)mplp->pos+1); + + // prepare the pileup + int nreads = 0; + for (i=0; insmpl; i++) + { + int ret = mplp_set_pileup(mplp, &mplp->read_buf[i]); + if ( ret!=0 ) return -1; + nreads += mplp->read_buf[i].n; + } + return nreads>0 /* || nregions */ ? 1 : 0; +} + + + diff --git a/mpileup2/mpileup.h b/mpileup2/mpileup.h index 08edcae7e..164484b48 100644 --- a/mpileup2/mpileup.h +++ b/mpileup2/mpileup.h @@ -27,10 +27,9 @@ #ifndef __MPILEUP20_H__ #define __MPILEUP20_H__ -typedef struct _mpileup_t mpileup_t; +typedef struct mpileup_t_ mpileup_t; -// Various options. Key marked as 'can fail' shoud be set with mpileup_set() and the -// return status checked as these can fail. +// Various options. For keys marked as 'can fail' the return status should be checked as these can fail typedef enum { // required @@ -50,9 +49,13 @@ typedef enum // bit flags SMART_OVERLAPS, // int {0,1}, 1:disable read-pair overlap detection [0] SMPL_IGNORE_RG, // int {0,1}, 1:ignore read groups, one bam = one sample [0] + SKIP_ANY_UNSET, // skip a read if any of the bits is not set [0] + SKIP_ALL_UNSET, // skip a read if all these bits are not set [0] + SKIP_ANY_SET, // skip a read if any of the bits is set [BAM_FUNMAP|BAM_FSECONDARY|BAM_FQCFAIL|BAM_FDUP] + SKIP_ALL_SET, // skip a read if all these bits are set [0] // numeric parameters - MAX_DP_PER_FILE, // int, maximum depth per file, regardless of the number of samples [250] + MAX_DP_PER_SAMPLE, // int, maximum depth per sample [250] ADJUST_MQ, // int, --adjust-MQ value for downgrading reads with excessive mismatches, 0 to disable [0] MIN_MQ, // int, skip alignments with mapQ smaller than MIN_MQ [0] MIN_BQ, // int, skip bases with baseQ smaller than MIN_BQ [1] @@ -65,12 +68,13 @@ typedef enum } mpileup_opt_t; -#define mpileup_set_opt(mplp,type,key,value) { type tmp = value; mpileup_set(mplp, key, (void*)&tmp); } #define mpileup_get_opt(mplp,type,key) (*(type*)mpileup_get(mplp, key)) -mpileup_t *mpileup_init(void); -int mpileup_set(mpileup_t *mplp, mpileup_opt_t key, void *value); // returns 0 on success +mpileup_t *mpileup_alloc(void); +int mpileup_set(mpileup_t *mplp, mpileup_opt_t key, ...); // returns 0 on success void *mpileup_get(mpileup_t *mplp, mpileup_opt_t key); +int mpileup_init(mpileup_t *mplp); // inits regions, bams, iterators, returns 0 on success +int mpileup_next(mpileup_t *mplp); // returns 1 on next position, 0 when done, negative value on error void mpileup_destroy(mpileup_t *mplp); #endif From 8493d9687553c22717af8213040015f6cbe9bafe Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Mon, 26 Jun 2023 16:29:34 +0100 Subject: [PATCH 5/6] backup commit: finish regions vgr $bt mpileup2 -f test/mpileup/mpileup.ref.fa test/mpileup/mpileup.1.bam test/mpileup/mpileup.2.bam -o /dev/null -r 17:10,17:16 2>&1 | less --- mpileup2.c | 2 +- mpileup2/mpileup.c | 188 +++++++++++++++++++++++++++++++++++++-------- mpileup2/mpileup.h | 54 ++++++++++--- 3 files changed, 202 insertions(+), 42 deletions(-) diff --git a/mpileup2.c b/mpileup2.c index 1198877c6..4febc1432 100644 --- a/mpileup2.c +++ b/mpileup2.c @@ -1,6 +1,6 @@ /* mpileup2.c -- mpileup2, new version of mpileup - Copyright (C) 2022 Genome Research Ltd. + Copyright (C) 2022-2023 Genome Research Ltd. Author: Petr Danecek diff --git a/mpileup2/mpileup.c b/mpileup2/mpileup.c index 49744d169..a904077f8 100644 --- a/mpileup2/mpileup.c +++ b/mpileup2/mpileup.c @@ -1,6 +1,6 @@ /* mileup2.c -- mpileup v2.0 API - Copyright (C) 2022 Genome Research Ltd. + Copyright (C) 2022-2023 Genome Research Ltd. Author: Petr Danecek @@ -31,6 +31,8 @@ #include #include #include +//#include // fixme: itr_loop and itr->seq is not working for some reason +#include "regidx.h" #include #include "mpileup.h" #include "bam_sample.h" @@ -53,6 +55,7 @@ typedef struct } rb_node_t; +// Reads from a small window around the current position, per sample, not per BAM typedef struct { int tid; @@ -70,6 +73,7 @@ typedef struct { samFile *fp; bam_hdr_t *hdr; + hts_idx_t *idx; hts_itr_t *iter; char *fname; int bam_id; @@ -82,12 +86,15 @@ bam_aux_t; struct mpileup_t_ { - char *fasta_ref, *regions, *regions_fname, *targets, *targets_fname, *samples, *samples_fname, *read_groups_fname; + char *fasta_ref, *targets, *targets_fname, *samples, *samples_fname, *read_groups_fname; int smart_overlaps, smpl_ignore_rg, max_dp_per_sample, adjust_mq, min_mq, min_bq, max_bq, delta_bq; int min_realn_dp, max_realn_dp, max_realn_len, skip_any_unset, skip_all_unset, skip_any_set, skip_all_set; float min_realn_frac; int nbam, nbam_names; char **bam_names; + regidx_t *regions_idx; + regitr_t *regions_itr; + int nregions; bam_aux_t *bam; bam_hdr_t *hdr; bam_smpl_t *bsmpl; @@ -97,10 +104,13 @@ struct mpileup_t_ hts_pos_t pos; bam_pileup1_t **plp; int *nplp; + kstring_t tmp_str; }; static int mplp_add_bam(mpileup_t *mplp, char *bam, int is_file); +static int mplp_add_region(mpileup_t *mplp, char *regions, int is_file); static void read_buffer_clear(read_buffer_t *rbuf); +static void read_buffer_reset(read_buffer_t *rbuf); mpileup_t *mpileup_alloc(void) { @@ -148,18 +158,20 @@ int mpileup_set(mpileup_t *mplp, mpileup_opt_t key, ...) } case REGIONS: - free(mplp->regions); + { va_start(args, key); - mplp->regions = strdup(va_arg(args,char*)); + char *regions = va_arg(args,char*); va_end(args); - return mplp->regions ? 0 : -1; + return mplp_add_region(mplp,regions,0); + } case REGIONS_FNAME: - free(mplp->regions_fname); + { va_start(args, key); - mplp->regions_fname = strdup(va_arg(args,char*)); + char *regions = va_arg(args,char*); va_end(args); - return mplp->regions_fname ? 0 : -1; + return mplp_add_region(mplp,regions,1); + } case TARGETS: free(mplp->targets); @@ -294,7 +306,7 @@ int mpileup_set(mpileup_t *mplp, mpileup_opt_t key, ...) default: hts_log_error("Todo: mpileup_set key=%d",(int)key); - return 1; + return -1; break; } return 0; @@ -332,6 +344,8 @@ void mpileup_destroy(mpileup_t *mplp) { sam_close(mplp->bam[i].fp); if ( mplp->bam[i].cached_rec ) bam_destroy1(mplp->bam[i].cached_rec); + if ( mplp->bam[i].idx ) hts_idx_destroy(mplp->bam[i].idx); + if ( mplp->bam[i].iter ) hts_itr_destroy(mplp->bam[i].iter); } for (i=0; inbam_names; i++) free(mplp->bam_names[i]); free(mplp->bam); @@ -343,13 +357,14 @@ void mpileup_destroy(mpileup_t *mplp) bam_hdr_destroy(mplp->hdr); if ( mplp->bsmpl ) bam_smpl_destroy(mplp->bsmpl); free(mplp->fasta_ref); - free(mplp->regions); - free(mplp->regions_fname); free(mplp->targets); free(mplp->targets_fname); + if ( mplp->regions_idx ) regidx_destroy(mplp->regions_idx); + if ( mplp->regions_itr ) regitr_destroy(mplp->regions_itr); free(mplp->samples); free(mplp->samples_fname); free(mplp->read_groups_fname); + free(mplp->tmp_str.s); free(mplp); } @@ -359,6 +374,30 @@ static int is_url(const char *str) return str[strspn(str, uri_scheme_chars)] == ':'; } +static int mplp_add_region(mpileup_t *mplp, char *regions, int is_file) +{ + if ( mplp->regions_idx ) return -1; // regions can be added only once + if ( is_file ) + mplp->regions_idx = regidx_init(regions,NULL,NULL,0,NULL); + else + { + mplp->regions_idx = regidx_init(NULL,regidx_parse_reg,NULL,sizeof(char*),NULL); + if ( !mplp->regions_idx || regidx_insert_list(mplp->regions_idx,regions,',')!=0 ) + { + hts_log_error("Could not initialize the regions: %s",regions); + return -1; + } + } + if ( !mplp->regions_idx ) + { + hts_log_error("Could not initialize the regions: %s",regions); + return -1; + } + mplp->nregions = regidx_nregs(mplp->regions_idx); + mplp->regions_itr = regitr_init(mplp->regions_idx); + regitr_loop(mplp->regions_itr); // region iterator now positioned at the first region + return 0; +} static int mplp_add_bam(mpileup_t *mplp, char *bam, int is_file) { struct stat sbuf; @@ -394,13 +433,13 @@ static int mplp_add_bam(mpileup_t *mplp, char *bam, int is_file) } for (i=0; ibam_names,(mplp->nbam_names+1)*sizeof(*mplp->bam_names)); if ( !tmp ) { hts_log_error("Could not allocate %zu bytes",(mplp->nbam_names+1)*sizeof(*mplp->bam_names)); - return 1; + return -1; } mplp->bam_names = tmp; mplp->bam_names[mplp->nbam_names++] = strdup(bam); @@ -422,19 +461,19 @@ int mpileup_init(mpileup_t *mplp) if ( mplp->samples && !bam_smpl_add_samples(mplp->bsmpl,mplp->samples,0) ) { hts_log_error("Could not parse samples: %s", mplp->samples); - return 1; + return -1; } if ( mplp->samples_fname && !bam_smpl_add_samples(mplp->bsmpl,mplp->samples_fname,0) ) { hts_log_error("Could not read samples: %s", mplp->samples_fname); - return 1; + return -1; } mplp->bam = (bam_aux_t*)calloc(mplp->nbam_names,sizeof(*mplp->bam)); if ( !mplp->bam ) { hts_log_error("Could not allocate %zu bytes",mplp->nbam_names*sizeof(*mplp->bam)); - return 1; + return -1; } int i; for (i=0; inbam_names; i++) @@ -447,17 +486,17 @@ int mpileup_init(mpileup_t *mplp) if ( !baux->fp ) { hts_log_error("Failed to open %s: %s",fname,strerror(errno)); - return 1; + return -1; } if ( hts_set_opt(baux->fp, CRAM_OPT_DECODE_MD, 1) ) { hts_log_error("Failed to set CRAM_OPT_DECODE_MD value: %s\n",fname); - return 1; + return -1; } if ( mplp->fasta_ref && hts_set_fai_filename(baux->fp,mplp->fasta_ref)!=0 ) { hts_log_error("Failed to process %s: %s\n",mplp->fasta_ref, strerror(errno)); - return 1; + return -1; } // baux->conf = conf; // baux->ref = &mp_ref; @@ -465,7 +504,7 @@ int mpileup_init(mpileup_t *mplp) if ( !hdr ) { hts_log_error("Failed to read the header of %s\n",fname); - return 1; + return -1; } baux->bam_id = bam_smpl_add_bam(mplp->bsmpl,hdr->text,fname); if ( baux->bam_id < 0 ) @@ -475,12 +514,43 @@ int mpileup_init(mpileup_t *mplp) bam_hdr_destroy(hdr); } mplp->nbam++; + + // iterators + if ( mplp->regions_idx ) + { + hts_idx_t *idx = sam_index_load(baux->fp,baux->fname); + if ( !idx ) + { + hts_log_error("Failed to load index: %s\n",fname); + return 1; + } + mplp->tmp_str.l = 0; + //ksprintf(&mplp->tmp_str,"%s:%"PRIhts_pos"-%"PRIhts_pos,mplp->regions_itr->seq,mplp->regions_itr->beg+1,mplp->regions_itr->end+1); + ksprintf(&mplp->tmp_str,"%s:%u-%u",mplp->regions_itr->seq,mplp->regions_itr->beg+1,mplp->regions_itr->end+1); + baux->iter = sam_itr_querys(idx, hdr, mplp->tmp_str.s); + if ( !baux->iter ) + { + baux->iter = sam_itr_querys(idx, hdr, mplp->regions_itr->seq); + if ( baux->iter ) + { + hts_log_error("[Failed to parse the region: %s",mplp->tmp_str.s); + return 1; + } + hts_log_error("The sequence \"%s\" not found: %s",mplp->regions_itr->seq,fname); + return 1; + } + if ( mplp->nregions==1 ) // no need to keep the index in memory + hts_idx_destroy(idx); + else + baux->idx = idx; + } + if ( !mplp->hdr ) mplp->hdr = hdr; else { int ret = mplp_check_header_contigs(mplp,hdr); bam_hdr_destroy(hdr); - if ( ret!=0 ) return 1; + if ( ret!=0 ) return -1; } baux->hdr = mplp->hdr; baux->tid = -1; @@ -489,7 +559,7 @@ int mpileup_init(mpileup_t *mplp) if ( !mplp->nsmpl ) { hts_log_error("Failed to initialize samples"); - return 1; + return -1; } mplp->read_buf = (read_buffer_t*) calloc(mplp->nsmpl,sizeof(*mplp->read_buf)); mplp->plp = (bam_pileup1_t**) calloc(mplp->nsmpl,sizeof(*mplp->plp)); @@ -498,7 +568,9 @@ int mpileup_init(mpileup_t *mplp) return 0; } -void read_buffer_clear(read_buffer_t *rbuf) + +// Clear the per-sample buffer, destroying all its data +static void read_buffer_clear(read_buffer_t *rbuf) { int i; for (i=0; im; i++) @@ -508,6 +580,18 @@ void read_buffer_clear(read_buffer_t *rbuf) free(rbuf->buf); memset(rbuf,0,sizeof(*rbuf)); } + +// Reset the buffer to be ready for the next region +static void read_buffer_reset(read_buffer_t *rbuf) +{ + int i; + for (i=0; im; i++) rbuf->buf[i].is_set = 0; + rbuf->n = 0; + rbuf->tid = 0; + rbuf->pos = 0; + rbuf->empty = 0; +} + /* * Returns * 0 .. on success (pushed onto the buffer, swapped rec for NULL or an empty record) @@ -515,7 +599,7 @@ void read_buffer_clear(read_buffer_t *rbuf) * 2 .. cannot push (past the current position) * -1 .. error */ -int read_buffer_push(read_buffer_t *rbuf, hts_pos_t pos, bam1_t **rec_ptr) +static int read_buffer_push(read_buffer_t *rbuf, hts_pos_t pos, bam1_t **rec_ptr) { static int warned = 0; if ( !warned ) hts_log_error("read_buffer_push - left softclip vs read start"); @@ -751,15 +835,10 @@ static int mplp_set_pileup(mpileup_t *mplp, read_buffer_t *rbuf) return 0; } -int mpileup_next(mpileup_t *mplp) +static int mplp_region_next(mpileup_t *mplp) { int i; - // if new region is needed - // - advance regitr_loop - // - reset read_buffers - // - reset mplp tid and pos - // fill buffers mplp->pos++; for (i=0; inbam; i++) @@ -781,8 +860,6 @@ int mpileup_next(mpileup_t *mplp) } } -fprintf(stderr,"tid,pos=%d %d\n",mplp->tid,(int)mplp->pos+1); - // prepare the pileup int nreads = 0; for (i=0; insmpl; i++) @@ -794,5 +871,52 @@ fprintf(stderr,"tid,pos=%d %d\n",mplp->tid,(int)mplp->pos+1); return nreads>0 /* || nregions */ ? 1 : 0; } +int mpileup_next(mpileup_t *mplp) +{ + // if new region is needed + // o advance regitr_loop + // - reset read_buffers + // o reset mplp tid and pos + + if ( !mplp->nregions ) + return mplp_region_next(mplp); + int i; + while (1) + { + int ret = mplp_region_next(mplp); + if ( ret>0 ) + { + if ( mplp->pos < mplp->regions_itr->beg ) continue; +if ( mplp->pos <= mplp->regions_itr->end ) fprintf(stderr,"mpileup_next: ret=%d tid,pos=%d %d\n",ret,mplp->tid,(int)mplp->pos+1); + if ( mplp->pos <= mplp->regions_itr->end ) return ret; + } + if ( !regitr_loop(mplp->regions_itr) ) return 0; + + for (i=0; insmpl; i++) read_buffer_reset(&mplp->read_buf[i]); + + mplp->tmp_str.l = 0; + //ksprintf(&mplp->tmp_str,"%s:%"PRIhts_pos"-%"PRIhts_pos,mplp->regions_itr->seq,mplp->regions_itr->beg+1,mplp->regions_itr->end+1); + ksprintf(&mplp->tmp_str,"%s:%u-%u",mplp->regions_itr->seq,mplp->regions_itr->beg+1,mplp->regions_itr->end+1); + for (i=0; inbam; i++) + { + bam_aux_t *baux = &mplp->bam[i]; + hts_itr_destroy(baux->iter); + baux->iter = sam_itr_querys(baux->idx, baux->hdr, mplp->tmp_str.s); + if ( !baux->iter ) + { + baux->iter = sam_itr_querys(baux->idx, baux->hdr, mplp->regions_itr->seq); + if ( baux->iter ) + { + hts_log_error("[Failed to parse the region: %s",mplp->tmp_str.s); + return 1; + } + hts_log_error("The sequence \"%s\" not found: %s",mplp->regions_itr->seq,baux->fname); + return -1; + } + baux->tid = baux->pos = -1; + } + mplp->tid = mplp->pos = -1; + } +} diff --git a/mpileup2/mpileup.h b/mpileup2/mpileup.h index 164484b48..a30b271a8 100644 --- a/mpileup2/mpileup.h +++ b/mpileup2/mpileup.h @@ -1,6 +1,6 @@ /* mileup2.h -- mpileup v2.0 API - Copyright (C) 2022 Genome Research Ltd. + Copyright (C) 2022-2023 Genome Research Ltd. Author: Petr Danecek @@ -29,13 +29,12 @@ typedef struct mpileup_t_ mpileup_t; -// Various options. For keys marked as 'can fail' the return status should be checked as these can fail typedef enum { // required FASTA_REF, // char*, fasta reference - BAM, // char*, add another BAM/CRAM. Can fail. - BAM_FNAME, // char*, read list of BAMs/CRAMs from a file. Can fail. + BAM, // char*, add another BAM/CRAM + BAM_FNAME, // char*, read list of BAMs/CRAMs from a file // optional REGIONS, // char*, comma-separated regions @@ -68,13 +67,50 @@ typedef enum } mpileup_opt_t; -#define mpileup_get_opt(mplp,type,key) (*(type*)mpileup_get(mplp, key)) - +/** + * mpileup_alloc() - allocate the mpileup structure + * + * Note that any default values set by the function can change without a warning and + * therefore should be set explicitly with `mpileup_set()` by the caller. + */ mpileup_t *mpileup_alloc(void); + +/** + * mpileup_destroy - destroy the mpileup structure + */ +void mpileup_destroy(mpileup_t *mplp); + +/** + * mpileup_set() - set various options, see the mpileup_opt_t keys for the complete list + * + * Returns 0 if the call succeeded, or negative number on error. + */ int mpileup_set(mpileup_t *mplp, mpileup_opt_t key, ...); // returns 0 on success + +/** + * mpileup_get() - get various options, see the mpileup_opt_t keys + * mpileup_get_opt() - wrapper for `mpileup_get()` to return typed value + * + * The former returns pointer to the memory area populated by the requested setting, + * its type can be inferred from the mpileup_opt_t documentation. + */ void *mpileup_get(mpileup_t *mplp, mpileup_opt_t key); -int mpileup_init(mpileup_t *mplp); // inits regions, bams, iterators, returns 0 on success -int mpileup_next(mpileup_t *mplp); // returns 1 on next position, 0 when done, negative value on error -void mpileup_destroy(mpileup_t *mplp); +#define mpileup_get_opt(mplp,type,key) (*(type*)mpileup_get(mplp, key)) + +/** + * mpileup_init() - inits regions, bams, iterators, etc. + * + * Should be called after fully set up, i.e. after all `mpileup_set()` calls. + * + * Returns 0 on success or negative number on error. + */ +int mpileup_init(mpileup_t *mplp); + +/** + * mpileup_next() - positions mpileup to the next genomic position + * + * Returns 1 when next position is available, 0 when all regions are done, negative value on error + */ +int mpileup_next(mpileup_t *mplp); #endif From 922a5f1a518d75c5f6bc7ffa8533b68a1f3f6d0c Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Tue, 11 Jul 2023 11:32:39 +0200 Subject: [PATCH 6/6] backup commit --- mpileup2.c | 39 +++++++++++++++++++++++++-------------- mpileup2/mpileup.c | 11 +++++++---- mpileup2/mpileup.h | 9 +++++++-- 3 files changed, 39 insertions(+), 20 deletions(-) diff --git a/mpileup2.c b/mpileup2.c index 4febc1432..a53106233 100644 --- a/mpileup2.c +++ b/mpileup2.c @@ -72,9 +72,20 @@ args_t; static int run_mpileup(args_t *args) { - int ret; + int ret, nsmpl = mpileup_get_val(args->mplp,int,N_SAMPLES); while ( (ret=mpileup_next(args->mplp))==1 ) { +xxxx hts_pos_t pos = mpileup_get_val(args->mplp,int,PLP_POS); + int *nreads = mpileup_get_val(args->mplp,int,N_READS); + char *bases = mpileup_get_val(args->mplp,char,BASES); + int i,j,k=0; + for (i=0; implp, int, SKIP_ALL_SET)); - char *tmp_skip_any_unset = bam_flag2str(mpileup_get_opt(args->mplp, int, SKIP_ANY_UNSET)); - char *tmp_skip_all_unset = bam_flag2str(mpileup_get_opt(args->mplp, int, SKIP_ALL_UNSET)); - char *tmp_skip_any_set = bam_flag2str(mpileup_get_opt(args->mplp, int, SKIP_ANY_SET)); + char *tmp_skip_all_set = bam_flag2str(mpileup_get_val(args->mplp, int, SKIP_ALL_SET)); + char *tmp_skip_any_unset = bam_flag2str(mpileup_get_val(args->mplp, int, SKIP_ANY_UNSET)); + char *tmp_skip_all_unset = bam_flag2str(mpileup_get_val(args->mplp, int, SKIP_ALL_UNSET)); + char *tmp_skip_any_set = bam_flag2str(mpileup_get_val(args->mplp, int, SKIP_ANY_SET)); fprintf(fp, "\n" @@ -205,19 +216,19 @@ static void print_usage(args_t *args, FILE *fp) " -B, --no-BAQ Disable BAQ (per-Base Alignment Quality)\n" " -C, --adjust-MQ INT Adjust mapping quality [0]\n" " -D, --full-BAQ Apply BAQ everywhere, not just in problematic regions\n" - " -d, --max-depth INT Max raw per-sample depth; avoids excessive memory usage [%d]\n", mpileup_get_opt(args->mplp, int, MAX_DP_PER_SAMPLE)); + " -d, --max-depth INT Max raw per-sample depth; avoids excessive memory usage [%d]\n", mpileup_get_val(args->mplp, int, MAX_DP_PER_SAMPLE)); fprintf(fp, " -E, --redo-BAQ Recalculate BAQ on the fly, ignore existing BQs\n" " -f, --fasta-ref FILE Faidx indexed reference sequence file\n" " --no-reference Do not require fasta reference file\n" " -G, --read-groups FILE Select or exclude read groups listed in the file\n" - " -q, --min-MQ INT Skip alignments with mapQ smaller than INT [%d]\n", mpileup_get_opt(args->mplp, int, MIN_MQ)); + " -q, --min-MQ INT Skip alignments with mapQ smaller than INT [%d]\n", mpileup_get_val(args->mplp, int, MIN_MQ)); fprintf(fp, - " -Q, --min-BQ INT Skip bases with baseQ/BAQ smaller than INT [%d]\n", mpileup_get_opt(args->mplp, int, MIN_BQ)); + " -Q, --min-BQ INT Skip bases with baseQ/BAQ smaller than INT [%d]\n", mpileup_get_val(args->mplp, int, MIN_BQ)); fprintf(fp, - " --max-BQ INT Limit baseQ/BAQ to no more than INT [%d]\n", mpileup_get_opt(args->mplp, int, MAX_BQ)); + " --max-BQ INT Limit baseQ/BAQ to no more than INT [%d]\n", mpileup_get_val(args->mplp, int, MAX_BQ)); fprintf(fp, - " --delta-BQ INT Use neighbour_qual + INT if less than qual [%d]\n", mpileup_get_opt(args->mplp, int, DELTA_BQ)); + " --delta-BQ INT Use neighbour_qual + INT if less than qual [%d]\n", mpileup_get_val(args->mplp, int, DELTA_BQ)); fprintf(fp, " -r, --regions REG[,...] Comma separated list of regions in which pileup is generated\n" " -R, --regions-file FILE Restrict to regions listed in a file\n" @@ -250,16 +261,16 @@ static void print_usage(args_t *args, FILE *fp) " -X, --config STR Specify platform specific profiles (see below)\n" " -e, --ext-prob INT Phred-scaled gap extension seq error probability [%d]\n", args->bca.extQ); fprintf(fp, - " -F, --gap-frac FLOAT Minimum fraction of gapped reads [%f]\n", mpileup_get_opt(args->mplp, float, MIN_REALN_FRAC)); + " -F, --gap-frac FLOAT Minimum fraction of gapped reads [%f]\n", mpileup_get_val(args->mplp, float, MIN_REALN_FRAC)); fprintf(fp, " -h, --tandem-qual INT Coefficient for homopolymer errors [%d]\n", args->bca.tandemQ); fprintf(fp, " -I, --skip-indels Do not perform indel calling\n" - " -L, --max-idepth INT Subsample to maximum per-sample depth for INDEL calling [%d]\n", mpileup_get_opt(args->mplp, int, MAX_REALN_DP)); + " -L, --max-idepth INT Subsample to maximum per-sample depth for INDEL calling [%d]\n", mpileup_get_val(args->mplp, int, MAX_REALN_DP)); fprintf(fp, - " -m, --min-ireads INT Minimum number gapped reads for indel candidates [%d]\n", mpileup_get_opt(args->mplp, int, MIN_REALN_DP)); + " -m, --min-ireads INT Minimum number gapped reads for indel candidates [%d]\n", mpileup_get_val(args->mplp, int, MIN_REALN_DP)); fprintf(fp, - " -M, --max-read-len INT Maximum length of read to pass to BAQ algorithm [%d]\n", mpileup_get_opt(args->mplp, int, MAX_REALN_LEN)); + " -M, --max-read-len INT Maximum length of read to pass to BAQ algorithm [%d]\n", mpileup_get_val(args->mplp, int, MAX_REALN_LEN)); fprintf(fp, " -o, --open-prob INT Phred-scaled gap open seq error probability [%d]\n", args->bca.openQ); fprintf(fp, diff --git a/mpileup2/mpileup.c b/mpileup2/mpileup.c index a904077f8..afef6cf8e 100644 --- a/mpileup2/mpileup.c +++ b/mpileup2/mpileup.c @@ -786,6 +786,7 @@ static int cigar_get_pos(cigar_state_t *cs, bam1_t *bam, hts_pos_t pos, int32_t return -1; } +// Go through all reads in the buffer and make the pileup structures ready for output static int mplp_set_pileup(mpileup_t *mplp, read_buffer_t *rbuf) { if ( !rbuf->n ) return 0; @@ -793,7 +794,7 @@ static int mplp_set_pileup(mpileup_t *mplp, read_buffer_t *rbuf) int iprev = -1, i = rbuf->head; while ( 1 ) { - rb_node_t *node = &rbuf->buf[i]; + rb_node_t *node = &rbuf->buf[i]; // rb_node_t keeps info about a single read int32_t ipos; int ret = cigar_get_pos(&node->cstate, node->bam, mplp->pos, &ipos); if ( ret==BAM_CMATCH ) // there is a base, match or mismatch @@ -839,8 +840,10 @@ static int mplp_region_next(mpileup_t *mplp) { int i; - // fill buffers mplp->pos++; + if ( mplp->nregions && mplp->pos > mplp->regions_itr->end ) return 0; + + // fill buffers for (i=0; inbam; i++) { int ret = mplp_read_bam(mplp, &mplp->bam[i]); @@ -858,6 +861,7 @@ static int mplp_region_next(mpileup_t *mplp) if ( mplp->tid > mplp->read_buf[i].tid ) mplp->tid = mplp->read_buf[i].tid, mplp->pos = HTS_POS_MAX; if ( mplp->pos > mplp->read_buf[i].pos ) mplp->pos = mplp->read_buf[i].pos; } + if ( mplp->nregions && mplp->pos < mplp->regions_itr->beg ) mplp->pos = mplp->regions_itr->beg; } // prepare the pileup @@ -875,7 +879,7 @@ int mpileup_next(mpileup_t *mplp) { // if new region is needed // o advance regitr_loop - // - reset read_buffers + // o reset read_buffers // o reset mplp tid and pos if ( !mplp->nregions ) @@ -888,7 +892,6 @@ int mpileup_next(mpileup_t *mplp) if ( ret>0 ) { if ( mplp->pos < mplp->regions_itr->beg ) continue; -if ( mplp->pos <= mplp->regions_itr->end ) fprintf(stderr,"mpileup_next: ret=%d tid,pos=%d %d\n",ret,mplp->tid,(int)mplp->pos+1); if ( mplp->pos <= mplp->regions_itr->end ) return ret; } if ( !regitr_loop(mplp->regions_itr) ) return 0; diff --git a/mpileup2/mpileup.h b/mpileup2/mpileup.h index a30b271a8..7b6d66c25 100644 --- a/mpileup2/mpileup.h +++ b/mpileup2/mpileup.h @@ -64,6 +64,11 @@ typedef enum MIN_REALN_DP, // int, minimum number of reads with evidence for an indel to consider the site needing realignment [2] MAX_REALN_DP, // int, subsample to this many reads for realignment [250] MAX_REALN_LEN, // int, realign reads this long max [500] + + // pileup related values + N_SAMPLES, // int, number of samples in the pileup + N_READS, // int*, number of reads per sample at the current position + PILEUP, // } mpileup_opt_t; @@ -89,13 +94,13 @@ int mpileup_set(mpileup_t *mplp, mpileup_opt_t key, ...); // returns 0 on succ /** * mpileup_get() - get various options, see the mpileup_opt_t keys - * mpileup_get_opt() - wrapper for `mpileup_get()` to return typed value + * mpileup_get_val() - wrapper for `mpileup_get()` to return typed value * * The former returns pointer to the memory area populated by the requested setting, * its type can be inferred from the mpileup_opt_t documentation. */ void *mpileup_get(mpileup_t *mplp, mpileup_opt_t key); -#define mpileup_get_opt(mplp,type,key) (*(type*)mpileup_get(mplp, key)) +#define mpileup_get_val(mplp,type,key) (*(type*)mpileup_get(mplp, key)) /** * mpileup_init() - inits regions, bams, iterators, etc.