From cfd4dc6e54414472b43c71b16d2eb6b4ff9f835b Mon Sep 17 00:00:00 2001 From: Romaric JODIN Date: Tue, 9 Feb 2021 16:10:51 +0100 Subject: [PATCH 01/48] cloud6 config --- common/inc/common.h | 2 +- host/inc/upvc.h | 4 ++-- host/src/dpu_backend.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/common/inc/common.h b/common/inc/common.h index 637403a..686ccb4 100644 --- a/common/inc/common.h +++ b/common/inc/common.h @@ -18,7 +18,7 @@ #define MAX_DPU_RESULTS (1 << 19) #define MAX_RESULTS_PER_READ (1 << 10) -#define SIZE_READ 120 +#define SIZE_READ 150 #define SIZE_SEED 14 #define SIZE_NEIGHBOUR_IN_BYTES ((SIZE_READ - SIZE_SEED) / 4) #define DELTA_NEIGHBOUR(round) ((SIZE_SEED * round) / 4) diff --git a/host/inc/upvc.h b/host/inc/upvc.h index 544a6b6..1dfa9f2 100644 --- a/host/inc/upvc.h +++ b/host/inc/upvc.h @@ -7,8 +7,8 @@ #define VERSION "VERSION 1.8" #define MAX_READS_BUFFER (512 * 1024) /* Maximum number of read by round */ -#define NB_READS_BUFFER (16) /* To be increase if enough legacy memory available */ -#define NB_DISPATCH_AND_ACC_BUFFER (4) /* To be increase if enough legacy memory available */ +#define NB_READS_BUFFER (128) /* To be increase if enough legacy memory available */ +#define NB_DISPATCH_AND_ACC_BUFFER (32) /* To be increase if enough legacy memory available */ #define NB_ROUND (1) #define COST_SUB 10 diff --git a/host/src/dpu_backend.c b/host/src/dpu_backend.c index 4db32e2..c1176e1 100644 --- a/host/src/dpu_backend.c +++ b/host/src/dpu_backend.c @@ -251,7 +251,7 @@ void run_on_dpu(unsigned int dpu_offset, unsigned int pass_id, sem_t *dispatch_f void init_backend_dpu(unsigned int *nb_dpus_per_run) { - const char *profile = "cycleAccurate=true,nrJobsPerRank=64"; + const char *profile = "cycleAccurate=true,nrJobsPerRank=64,dispatchOnAllRanks=true"; DPU_ASSERT(dpu_alloc(get_nb_dpu(), profile, &devices.all_ranks)); DPU_ASSERT(dpu_load_from_incbin(devices.all_ranks, &upvc_dpu_program, NULL)); From 
24851c40b1013dbc96849c03bdfd2419dc0ce398 Mon Sep 17 00:00:00 2001 From: Julien Legriel Date: Thu, 25 Feb 2021 14:22:33 +0100 Subject: [PATCH 02/48] Initial implem --- host/inc/genome.h | 10 +++++---- host/src/genome.c | 15 +++++++++++++ host/src/processread.c | 43 ++++++++++++++++++++----------------- host/src/upvc.c | 1 + host/src/vartree.c | 48 ++++++++++++++++++++++++++++++++++++++---- 5 files changed, 90 insertions(+), 27 deletions(-) diff --git a/host/inc/genome.h b/host/inc/genome.h index eb5bbe6..fed2a94 100644 --- a/host/inc/genome.h +++ b/host/inc/genome.h @@ -13,12 +13,12 @@ typedef struct { uint32_t magic; uint32_t version; - uint32_t nb_seq; - uint64_t pt_seq[MAX_SEQ_GEN]; - uint64_t len_seq[MAX_SEQ_GEN]; + uint32_t nb_seq; // nb of chromosomes (24) + uint64_t pt_seq[MAX_SEQ_GEN]; // offset of each chromosome in data + uint64_t len_seq[MAX_SEQ_GEN]; // length of chromosome in data uint64_t fasta_file_size; char seq_name[MAX_SEQ_GEN][MAX_SEQ_NAME_SIZE]; - int8_t *data; + int8_t *data; //genome de reference 1B = 1 nucleotide int32_t *mapping_coverage; } genome_t; @@ -30,4 +30,6 @@ void genome_free(); genome_t *genome_get(); +float** get_frequency_table(); + #endif /* __GENOME_H__ */ diff --git a/host/src/genome.c b/host/src/genome.c index 92b561a..fe89e4f 100644 --- a/host/src/genome.c +++ b/host/src/genome.c @@ -114,3 +114,18 @@ void genome_free() free(genome.data); free(genome.mapping_coverage); } + +static float* frequency_table[5]; + +float** get_frequency_table() { + + if(!frequency_table[0]) { + // allocate frequency_table on first call + for(int i = 0; i < 5; ++i) { + frequency_table[i] = (float*)calloc(genome.fasta_file_size, sizeof(float)); + } + } + return frequency_table; +} + + diff --git a/host/src/processread.c b/host/src/processread.c index f22603d..b2b8d0e 100644 --- a/host/src/processread.c +++ b/host/src/processread.c @@ -202,6 +202,7 @@ int DPD(int8_t *s1, int8_t *s2, backtrack_t *backtrack, int size_neighbour_in_sy * The code is return 
in "code" as a table of int8_t */ +#if 0 static int code_alignment(uint8_t *code, int score, int8_t *gen, int8_t *read, unsigned size_neighbour_in_symbols) { int code_idx, computed_score, backtrack_idx; @@ -377,6 +378,7 @@ static void set_variant( newvar, result_match.coord.seq_nr, pos_variant_genome + 1 - ref_genome->pt_seq[result_match.coord.seq_nr]); } } +#endif static pthread_mutex_t non_mapped_mutex; static void add_to_non_mapped_read(int numread, int round, FILE *fpe1, FILE *fpe2, int8_t *reads_buffer) @@ -407,6 +409,20 @@ static void add_to_non_mapped_read(int numread, int round, FILE *fpe1, FILE *fpe pthread_mutex_unlock(&non_mapped_mutex); } +static void update_frequency_table( + genome_t *ref_genome, + dpu_result_out_t *result_tab, + int8_t *reads_buffer, + int pos) { + + float **frequency_table = get_frequency_table(); + uint64_t genome_pos = ref_genome->pt_seq[result_tab[pos].coord.seq_nr] + result_tab[pos].coord.seed_nr; + int num = result_tab[pos].num; + int8_t *read = reads_buffer + (num * SIZE_READ); + for(int j = 0; j < SIZE_READ; ++j) + frequency_table[read[j]][genome_pos]++; +} + static volatile unsigned int curr_match; static pthread_mutex_t curr_match_mutex; unsigned int acquire_curr_match() @@ -446,7 +462,7 @@ static void do_process_read(process_read_arg_t *arg) genome_t *ref_genome = arg->ref_genome; FILE *fpe1 = arg->fpe1; FILE *fpe2 = arg->fpe2; - unsigned int size_neighbour_in_symbols = (SIZE_NEIGHBOUR_IN_BYTES - DELTA_NEIGHBOUR(round)) * 4; + //unsigned int size_neighbour_in_symbols = (SIZE_NEIGHBOUR_IN_BYTES - DELTA_NEIGHBOUR(round)) * 4; /* * The number of a pair is given by "num_read / 4 " (see dispatch_read function) @@ -512,25 +528,14 @@ static void do_process_read(process_read_arg_t *arg) } } if (np > 0) { - // choose the less covered zone on the genome - int x = 0; - uint64_t genome_pos; - int cov1, cov2; - int min_cov = 1000; - ; - for (unsigned int kk = 0; kk < np; kk++) { - genome_pos = 
ref_genome->pt_seq[result_tab[P1[kk]].coord.seq_nr] + result_tab[P1[kk]].coord.seed_nr; - cov1 = ref_genome->mapping_coverage[genome_pos]; - genome_pos = ref_genome->pt_seq[result_tab[P2[kk]].coord.seq_nr] + result_tab[P2[kk]].coord.seed_nr; - cov2 = ref_genome->mapping_coverage[genome_pos]; - if (cov1 + cov2 < min_cov) { - x = kk; - min_cov = cov1 + cov2; - } - } - set_variant(result_tab[P1[x]], ref_genome, reads_buffer, size_neighbour_in_symbols); - set_variant(result_tab[P2[x]], ref_genome, reads_buffer, size_neighbour_in_symbols); + // update frequency table + for (unsigned int i = 0; i < np; i++) { + + // for each matching position of this read in the reference genome, add +1 in the corresponding nucleotide column + update_frequency_table(ref_genome, result_tab, reads_buffer, P1[i]); + update_frequency_table(ref_genome, result_tab, reads_buffer, P2[i]); + } } else { pthread_mutex_lock(&nr_reads_mutex); nr_reads_non_mapped++; diff --git a/host/src/upvc.c b/host/src/upvc.c index e924b48..e303bdb 100644 --- a/host/src/upvc.c +++ b/host/src/upvc.c @@ -172,6 +172,7 @@ static void exec_round() fipe1 = fopen(filename, "r"); CHECK_FILE(fipe1, filename); assert(get_input_info(fipe1, &read_size1, &nb_read1) == 0); + printf("read_size1 %zu SIZE_READ %u\n", read_size1, SIZE_READ); assert(read_size1 == SIZE_READ); sprintf(filename, "%s_PE2.fastq", input_prefix); diff --git a/host/src/vartree.c b/host/src/vartree.c index 0fd3d25..5b18fa1 100644 --- a/host/src/vartree.c +++ b/host/src/vartree.c @@ -199,6 +199,38 @@ static bool print_variant_tree(variant_t *var, uint32_t seq_nr, uint64_t seq_pos return true; } +static variant_t * get_most_frequent_variant(genome_t * ref_genome, float ** frequency_table, uint64_t genome_pos) { + + static char nucleotide[4] = { 'A', 'C', 'T', 'G' }; + + float max = 0; + int8_t nucId = -1; + for(int i = 0; i < 5; ++i) { + float freq = frequency_table[i][genome_pos]++; + if(freq > max) { + max = freq; + nucId = i; + } + } + + if(nucId >= 0 && 
(nucId != ref_genome->data[genome_pos])) { + + assert(nucId < 4); + + // this is a substitution, create variant + variant_t *var = (variant_t *)malloc(sizeof(variant_t)); + var->score = max; + var->depth = 1; // TODO + var->ref[0] = nucleotide[ref_genome->data[genome_pos]]; + var->alt[0] = nucleotide[nucId]; + + return var; + } + + return NULL; +} + +//TODO here read frequency table and write vcf (take max of frequency table to find substitution if any) void create_vcf() { double start_time = my_clock(); @@ -235,15 +267,23 @@ void create_vcf() /* ####### END OF HEADER ####### */ + float **frequency_table = get_frequency_table(); + /* for each sequence in the genome */ for (uint32_t seq_number = 0; seq_number < ref_genome->nb_seq; seq_number++) { /* for each position in the sequence */ for (uint64_t seq_position = 0; seq_position < ref_genome->len_seq[seq_number]; seq_position++) { - variant_t *var = variant_list[seq_number][seq_position]; - while (var != NULL) { - nb_variant += print_variant_tree(var, seq_number, seq_position, ref_genome, vcf_file) ? 1 : 0; - var = var->next; + + uint64_t genome_pos = ref_genome->pt_seq[seq_number] + seq_position; + variant_t *var = get_most_frequent_variant(ref_genome, frequency_table, genome_pos); + if(var) { + nb_variant += print_variant_tree(var, seq_number, seq_position, ref_genome, vcf_file) ? 1 : 0; } + //variant_t *var = variant_list[seq_number][seq_position]; + //while (var != NULL) { + // nb_variant += print_variant_tree(var, seq_number, seq_position, ref_genome, vcf_file) ? 
1 : 0; + // var = var->next; + //} } } From 45ddb349a5c9c1bda7df7cf4577f221d5c5b8ce0 Mon Sep 17 00:00:00 2001 From: Julien Legriel Date: Tue, 2 Mar 2021 13:33:02 +0100 Subject: [PATCH 03/48] 98% tp and 37% cm --- common/inc/common.h | 2 +- dpu/src/task.c | 14 +++++++------- host/inc/genome.h | 7 ++++++- host/src/genome.c | 11 +++++++---- host/src/processread.c | 14 ++++++++++---- host/src/vartree.c | 32 ++++++++++++++++++++------------ 6 files changed, 51 insertions(+), 29 deletions(-) diff --git a/common/inc/common.h b/common/inc/common.h index 686ccb4..637403a 100644 --- a/common/inc/common.h +++ b/common/inc/common.h @@ -18,7 +18,7 @@ #define MAX_DPU_RESULTS (1 << 19) #define MAX_RESULTS_PER_READ (1 << 10) -#define SIZE_READ 150 +#define SIZE_READ 120 #define SIZE_SEED 14 #define SIZE_NEIGHBOUR_IN_BYTES ((SIZE_READ - SIZE_SEED) / 4) #define DELTA_NEIGHBOUR(round) ((SIZE_SEED * round) / 4) diff --git a/dpu/src/task.c b/dpu/src/task.c index cae848e..b561666 100644 --- a/dpu/src/task.c +++ b/dpu/src/task.c @@ -121,15 +121,15 @@ static void compare_neighbours(sysname_t tasklet_id, uint32_t *mini, coords_and_ STATS_STORE_NODP_TIME(tasklet_stats, (end + acc - start)); STATS_INCR_NB_NODP_CALLS(*tasklet_stats); - if (score_nodp == UINT_MAX) { - STATS_GET_START_TIME(start, acc, end); + //if (score_nodp == UINT_MAX) { + // STATS_GET_START_TIME(start, acc, end); - score_odpd = score = odpd(current_read_nbr, ref_nbr, *mini, NB_BYTES_TO_SYMS(SIZE_NEIGHBOUR_IN_BYTES, DPU_MRAM_INFO_VAR)); + // score_odpd = score = odpd(current_read_nbr, ref_nbr, *mini, NB_BYTES_TO_SYMS(SIZE_NEIGHBOUR_IN_BYTES, DPU_MRAM_INFO_VAR)); - STATS_GET_END_TIME(end, acc); - STATS_STORE_ODPD_TIME(tasklet_stats, (end + acc - start)); - STATS_INCR_NB_ODPD_CALLS(*tasklet_stats); - } + // STATS_GET_END_TIME(end, acc); + // STATS_STORE_ODPD_TIME(tasklet_stats, (end + acc - start)); + // STATS_INCR_NB_ODPD_CALLS(*tasklet_stats); + //} if (score > *mini) { return; diff --git a/host/inc/genome.h 
b/host/inc/genome.h index fed2a94..14b2fe1 100644 --- a/host/inc/genome.h +++ b/host/inc/genome.h @@ -30,6 +30,11 @@ void genome_free(); genome_t *genome_get(); -float** get_frequency_table(); +struct frequency_info { + + float freq; + unsigned int score; +}; +struct frequency_info** get_frequency_table(); #endif /* __GENOME_H__ */ diff --git a/host/src/genome.c b/host/src/genome.c index fe89e4f..48b2d12 100644 --- a/host/src/genome.c +++ b/host/src/genome.c @@ -115,15 +115,18 @@ void genome_free() free(genome.mapping_coverage); } -static float* frequency_table[5]; +//TODO free function +static struct frequency_info* frequency_table[5]; +static bool init_frequency_table = false; -float** get_frequency_table() { +struct frequency_info** get_frequency_table() { - if(!frequency_table[0]) { + if(!init_frequency_table) { // allocate frequency_table on first call for(int i = 0; i < 5; ++i) { - frequency_table[i] = (float*)calloc(genome.fasta_file_size, sizeof(float)); + frequency_table[i] = (struct frequency_info*)calloc(genome.fasta_file_size, sizeof(struct frequency_info)); } + init_frequency_table = true; } return frequency_table; } diff --git a/host/src/processread.c b/host/src/processread.c index b2b8d0e..20b4626 100644 --- a/host/src/processread.c +++ b/host/src/processread.c @@ -415,12 +415,18 @@ static void update_frequency_table( int8_t *reads_buffer, int pos) { - float **frequency_table = get_frequency_table(); - uint64_t genome_pos = ref_genome->pt_seq[result_tab[pos].coord.seq_nr] + result_tab[pos].coord.seed_nr; + struct frequency_info **frequency_table = get_frequency_table(); + uint64_t genome_pos = ref_genome->pt_seq[result_tab[pos].coord.seq_nr] + result_tab[pos].coord.seed_nr; int num = result_tab[pos].num; int8_t *read = reads_buffer + (num * SIZE_READ); - for(int j = 0; j < SIZE_READ; ++j) - frequency_table[read[j]][genome_pos]++; + for(int j = 0; j < SIZE_READ; ++j) { + if(genome_pos + j < genome_get()->fasta_file_size) { + 
frequency_table[read[j]][genome_pos+j].freq++; + frequency_table[read[j]][genome_pos+j].score += result_tab[pos].score; + } + else + printf("WARNING: reads matched at position that exceeds genome size\n"); + } } static volatile unsigned int curr_match; diff --git a/host/src/vartree.c b/host/src/vartree.c index 5b18fa1..617e472 100644 --- a/host/src/vartree.c +++ b/host/src/vartree.c @@ -173,14 +173,17 @@ static bool print_variant_tree(variant_t *var, uint32_t seq_nr, uint64_t seq_pos goto print; if (ref_len == alt_len) { /* SUBSTITUTION */ - if (depth < 3) { - return false; - } else if (depth > 20) { + //if (depth < 3) { + // return false; + //} else if (depth > 20) { + // depth = 20; + //} + //if (!(score <= sub_filter[depth].score && percentage >= sub_filter[depth].percentage)) { + // return false; + //} + if (depth > 20) { depth = 20; } - if (!(score <= sub_filter[depth].score && percentage >= sub_filter[depth].percentage)) { - return false; - } } else { /* INSERTION OR DELETION */ if (depth < 2) { return false; @@ -193,25 +196,27 @@ static bool print_variant_tree(variant_t *var, uint32_t seq_nr, uint64_t seq_pos } print: - fprintf(vcf_file, "%s\t%lu\t.\t%s\t%s\t.\t.\tDEPTH=%d;COV=%d;SCORE=%d\n", chr, seq_pos, var->ref, var->alt, var->depth, cov, + //TODO + fprintf(vcf_file, "%s\t%lu\t.\t%s\t%s\t.\t.\tDEPTH=%d;COV=%d;SCORE=%d\n", chr, seq_pos+1, var->ref, var->alt, var->depth, cov, score); return true; } -static variant_t * get_most_frequent_variant(genome_t * ref_genome, float ** frequency_table, uint64_t genome_pos) { +static variant_t * get_most_frequent_variant(genome_t * ref_genome, struct frequency_info ** frequency_table, uint64_t genome_pos) { static char nucleotide[4] = { 'A', 'C', 'T', 'G' }; float max = 0; int8_t nucId = -1; for(int i = 0; i < 5; ++i) { - float freq = frequency_table[i][genome_pos]++; + float freq = frequency_table[i][genome_pos].freq; if(freq > max) { max = freq; nucId = i; } } + //printf("get_most_frequent_variant: genome_pos %lu, 
nucleotide max freq %d %f %c\n", genome_pos, nucId, max, nucId >= 0 ? nucleotide[nucId] : '-'); if(nucId >= 0 && (nucId != ref_genome->data[genome_pos])) { @@ -219,10 +224,13 @@ static variant_t * get_most_frequent_variant(genome_t * ref_genome, float ** fre // this is a substitution, create variant variant_t *var = (variant_t *)malloc(sizeof(variant_t)); - var->score = max; - var->depth = 1; // TODO + var->score = frequency_table[nucId][genome_pos].score; + // TODO: at the moment the number of matches and the score is the same. If we start having weights, we should store both the count and score in frequency table + var->depth = max; var->ref[0] = nucleotide[ref_genome->data[genome_pos]]; + var->ref[1] = '\0'; var->alt[0] = nucleotide[nucId]; + var->alt[1] = '\0'; return var; } @@ -267,7 +275,7 @@ void create_vcf() /* ####### END OF HEADER ####### */ - float **frequency_table = get_frequency_table(); + struct frequency_info **frequency_table = get_frequency_table(); /* for each sequence in the genome */ for (uint32_t seq_number = 0; seq_number < ref_genome->nb_seq; seq_number++) { From 6b4cbb242272f2b4bef8cbc4642d47b12837c930 Mon Sep 17 00:00:00 2001 From: Julien Legriel Date: Tue, 2 Mar 2021 15:09:49 +0100 Subject: [PATCH 04/48] Implement changes explained by Bertil: variant if frequency > 20% --- common/inc/common.h | 2 +- host/src/vartree.c | 20 ++++++++++++++------ 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/common/inc/common.h b/common/inc/common.h index 637403a..686ccb4 100644 --- a/common/inc/common.h +++ b/common/inc/common.h @@ -18,7 +18,7 @@ #define MAX_DPU_RESULTS (1 << 19) #define MAX_RESULTS_PER_READ (1 << 10) -#define SIZE_READ 120 +#define SIZE_READ 150 #define SIZE_SEED 14 #define SIZE_NEIGHBOUR_IN_BYTES ((SIZE_READ - SIZE_SEED) / 4) #define DELTA_NEIGHBOUR(round) ((SIZE_SEED * round) / 4) diff --git a/host/src/vartree.c b/host/src/vartree.c index 617e472..0680788 100644 --- a/host/src/vartree.c +++ b/host/src/vartree.c @@ 
-207,18 +207,26 @@ static variant_t * get_most_frequent_variant(genome_t * ref_genome, struct frequ static char nucleotide[4] = { 'A', 'C', 'T', 'G' }; + float total = 0; + for(int i = 0; i < 5; ++i) { + total += frequency_table[i][genome_pos].freq; + } + float max = 0; int8_t nucId = -1; for(int i = 0; i < 5; ++i) { - float freq = frequency_table[i][genome_pos].freq; - if(freq > max) { - max = freq; - nucId = i; + float freq = frequency_table[i][genome_pos].freq; + if(i == ref_genome->data[genome_pos]) continue; // not a variant if the same nucleotide as in reference genome + if((freq / total > 0.2) && freq > 3) { // if frequency > 20% and depth > 3, consider it a variant + if(freq > max) { // keep variant of max frequency + max = freq; + nucId = i; + } } } //printf("get_most_frequent_variant: genome_pos %lu, nucleotide max freq %d %f %c\n", genome_pos, nucId, max, nucId >= 0 ? nucleotide[nucId] : '-'); - if(nucId >= 0 && (nucId != ref_genome->data[genome_pos])) { + if(nucId >= 0) { assert(nucId < 4); @@ -226,7 +234,7 @@ static variant_t * get_most_frequent_variant(genome_t * ref_genome, struct frequ variant_t *var = (variant_t *)malloc(sizeof(variant_t)); var->score = frequency_table[nucId][genome_pos].score; // TODO: at the moment the number of matches and the score is the same. 
If we start having weights, we should store both the count and score in frequency table - var->depth = max; + var->depth = frequency_table[nucId][genome_pos].freq; var->ref[0] = nucleotide[ref_genome->data[genome_pos]]; var->ref[1] = '\0'; var->alt[0] = nucleotide[nucId]; From 5d3738ffe4b604238e9f90c05a2b15028cf71dff Mon Sep 17 00:00:00 2001 From: Julien Legriel Date: Tue, 2 Mar 2021 15:44:29 +0100 Subject: [PATCH 05/48] Reporting all variants > 20% frequency --- common/inc/common.h | 2 +- host/src/vartree.c | 57 ++++++++++++++++++++++----------------------- 2 files changed, 29 insertions(+), 30 deletions(-) diff --git a/common/inc/common.h b/common/inc/common.h index 686ccb4..637403a 100644 --- a/common/inc/common.h +++ b/common/inc/common.h @@ -18,7 +18,7 @@ #define MAX_DPU_RESULTS (1 << 19) #define MAX_RESULTS_PER_READ (1 << 10) -#define SIZE_READ 150 +#define SIZE_READ 120 #define SIZE_SEED 14 #define SIZE_NEIGHBOUR_IN_BYTES ((SIZE_READ - SIZE_SEED) / 4) #define DELTA_NEIGHBOUR(round) ((SIZE_SEED * round) / 4) diff --git a/host/src/vartree.c b/host/src/vartree.c index 0680788..350665c 100644 --- a/host/src/vartree.c +++ b/host/src/vartree.c @@ -203,47 +203,36 @@ static bool print_variant_tree(variant_t *var, uint32_t seq_nr, uint64_t seq_pos return true; } -static variant_t * get_most_frequent_variant(genome_t * ref_genome, struct frequency_info ** frequency_table, uint64_t genome_pos) { +static variant_t ** get_most_frequent_variant(genome_t * ref_genome, struct frequency_info ** frequency_table, uint64_t genome_pos) { static char nucleotide[4] = { 'A', 'C', 'T', 'G' }; + variant_t** results = calloc(5, sizeof(variant_t*)); float total = 0; for(int i = 0; i < 5; ++i) { total += frequency_table[i][genome_pos].freq; } - float max = 0; - int8_t nucId = -1; for(int i = 0; i < 5; ++i) { float freq = frequency_table[i][genome_pos].freq; if(i == ref_genome->data[genome_pos]) continue; // not a variant if the same nucleotide as in reference genome if((freq / total > 
0.2) && freq > 3) { // if frequency > 20% and depth > 3, consider it a variant - if(freq > max) { // keep variant of max frequency - max = freq; - nucId = i; - } + + // this is a substitution, create variant + variant_t *var = (variant_t *)malloc(sizeof(variant_t)); + var->score = frequency_table[i][genome_pos].score; + // TODO: at the moment the number of matches and the score is the same. If we start having weights, we should store both the count and score in frequency table + var->depth = frequency_table[i][genome_pos].freq; + var->ref[0] = nucleotide[ref_genome->data[genome_pos]]; + var->ref[1] = '\0'; + var->alt[0] = nucleotide[i]; + var->alt[1] = '\0'; + results[i] = var; } } //printf("get_most_frequent_variant: genome_pos %lu, nucleotide max freq %d %f %c\n", genome_pos, nucId, max, nucId >= 0 ? nucleotide[nucId] : '-'); - if(nucId >= 0) { - - assert(nucId < 4); - - // this is a substitution, create variant - variant_t *var = (variant_t *)malloc(sizeof(variant_t)); - var->score = frequency_table[nucId][genome_pos].score; - // TODO: at the moment the number of matches and the score is the same. 
If we start having weights, we should store both the count and score in frequency table - var->depth = frequency_table[nucId][genome_pos].freq; - var->ref[0] = nucleotide[ref_genome->data[genome_pos]]; - var->ref[1] = '\0'; - var->alt[0] = nucleotide[nucId]; - var->alt[1] = '\0'; - - return var; - } - - return NULL; + return results; } //TODO here read frequency table and write vcf (take max of frequency table to find substitution if any) @@ -284,6 +273,7 @@ void create_vcf() /* ####### END OF HEADER ####### */ struct frequency_info **frequency_table = get_frequency_table(); + uint32_t nb_pos_multiple_var = 0; /* for each sequence in the genome */ for (uint32_t seq_number = 0; seq_number < ref_genome->nb_seq; seq_number++) { @@ -291,10 +281,19 @@ void create_vcf() for (uint64_t seq_position = 0; seq_position < ref_genome->len_seq[seq_number]; seq_position++) { uint64_t genome_pos = ref_genome->pt_seq[seq_number] + seq_position; - variant_t *var = get_most_frequent_variant(ref_genome, frequency_table, genome_pos); - if(var) { - nb_variant += print_variant_tree(var, seq_number, seq_position, ref_genome, vcf_file) ? 1 : 0; + variant_t ** results = get_most_frequent_variant(ref_genome, frequency_table, genome_pos); + int nb_var = 0; + for(int i = 0; i < 5; ++i) { + variant_t * var = results[i]; + if(var) { + nb_variant += print_variant_tree(var, seq_number, seq_position, ref_genome, vcf_file) ? 1 : 0; + free(var); + nb_var++; + } + if(nb_var > 1) + nb_pos_multiple_var++; } + free(results); //variant_t *var = variant_list[seq_number][seq_position]; //while (var != NULL) { // nb_variant += print_variant_tree(var, seq_number, seq_position, ref_genome, vcf_file) ? 
1 : 0; @@ -305,6 +304,6 @@ void create_vcf() fclose(vcf_file); - printf("\tnumber of variants: %d\n", nb_variant); + printf("\tnumber of variants: %d (multiple %d)\n", nb_variant, nb_pos_multiple_var); printf("\ttime: %lf s\n", my_clock() - start_time); } From d155195d2f867d261064e74f510939df2030b605 Mon Sep 17 00:00:00 2001 From: Julien Legriel Date: Tue, 2 Mar 2021 16:37:03 +0100 Subject: [PATCH 06/48] Remove the filter, free memory of variant --- common/inc/common.h | 2 +- host/inc/genome.h | 1 + host/src/genome.c | 11 +++++++++++ host/src/vartree.c | 14 ++++++-------- 4 files changed, 19 insertions(+), 9 deletions(-) diff --git a/common/inc/common.h b/common/inc/common.h index 637403a..c73f2d3 100644 --- a/common/inc/common.h +++ b/common/inc/common.h @@ -18,7 +18,7 @@ #define MAX_DPU_RESULTS (1 << 19) #define MAX_RESULTS_PER_READ (1 << 10) -#define SIZE_READ 120 +#define SIZE_READ 148 #define SIZE_SEED 14 #define SIZE_NEIGHBOUR_IN_BYTES ((SIZE_READ - SIZE_SEED) / 4) #define DELTA_NEIGHBOUR(round) ((SIZE_SEED * round) / 4) diff --git a/host/inc/genome.h b/host/inc/genome.h index 14b2fe1..6bdf35d 100644 --- a/host/inc/genome.h +++ b/host/inc/genome.h @@ -36,5 +36,6 @@ struct frequency_info { unsigned int score; }; struct frequency_info** get_frequency_table(); +void free_frequency_table(); #endif /* __GENOME_H__ */ diff --git a/host/src/genome.c b/host/src/genome.c index 48b2d12..ca113bd 100644 --- a/host/src/genome.c +++ b/host/src/genome.c @@ -131,4 +131,15 @@ struct frequency_info** get_frequency_table() { return frequency_table; } +void free_frequency_table() { + + if(init_frequency_table) { + for(int i = 0; i < 5; ++i) { + free(frequency_table[i]); + } + init_frequency_table = false; + } +} + + diff --git a/host/src/vartree.c b/host/src/vartree.c index 350665c..d247fa4 100644 --- a/host/src/vartree.c +++ b/host/src/vartree.c @@ -69,6 +69,7 @@ typedef struct { uint32_t score; } depth_filter_t; +#if 0 #if (SIZE_READ == 120) depth_filter_t sub_filter[] = { [3] 
= { 15, 16 }, @@ -140,6 +141,7 @@ depth_filter_t indel_filter[] = { #else #error "Filter not defined for this size of read" #endif +#endif static bool homopolymer(int8_t *seq, int offset) { @@ -190,9 +192,9 @@ static bool print_variant_tree(variant_t *var, uint32_t seq_nr, uint64_t seq_pos } else if (depth > 11) { depth = 11; } - if (!(score <= indel_filter[depth].score && percentage >= indel_filter[depth].percentage)) { - return false; - } + //if (!(score <= indel_filter[depth].score && percentage >= indel_filter[depth].percentage)) { + // return false; + //} } print: @@ -294,14 +296,10 @@ void create_vcf() nb_pos_multiple_var++; } free(results); - //variant_t *var = variant_list[seq_number][seq_position]; - //while (var != NULL) { - // nb_variant += print_variant_tree(var, seq_number, seq_position, ref_genome, vcf_file) ? 1 : 0; - // var = var->next; - //} } } + free_frequency_table(); fclose(vcf_file); printf("\tnumber of variants: %d (multiple %d)\n", nb_variant, nb_pos_multiple_var); From ff6428944ccaf2bb5c5c1b09f677cfe4746273ac Mon Sep 17 00:00:00 2001 From: Julien Legriel Date: Wed, 10 Mar 2021 09:16:06 +0100 Subject: [PATCH 07/48] Changes to also select the best score when no pair with matching positions --- common/inc/common.h | 1 + host/src/processread.c | 31 ++++++++++++++++++++++++++++++- host/src/vartree.c | 23 ++++++++++++++--------- 3 files changed, 45 insertions(+), 10 deletions(-) diff --git a/common/inc/common.h b/common/inc/common.h index c73f2d3..882efff 100644 --- a/common/inc/common.h +++ b/common/inc/common.h @@ -19,6 +19,7 @@ #define MAX_RESULTS_PER_READ (1 << 10) #define SIZE_READ 148 +//#define SIZE_READ 120 #define SIZE_SEED 14 #define SIZE_NEIGHBOUR_IN_BYTES ((SIZE_READ - SIZE_SEED) / 4) #define DELTA_NEIGHBOUR(round) ((SIZE_SEED * round) / 4) diff --git a/host/src/processread.c b/host/src/processread.c index 20b4626..6fa52e6 100644 --- a/host/src/processread.c +++ b/host/src/processread.c @@ -503,8 +503,12 @@ static void 
do_process_read(process_read_arg_t *arg) unsigned int P1[1000]; unsigned int P2[1000]; unsigned int np = 0; + unsigned int P1_all[1000]; + unsigned int P2_all[1000]; + unsigned int np_all = 0; unsigned int pos1, pos2, t1, t2; unsigned int best_score = 1000; + unsigned int best_score_all = 1000; // test all significant pairs of reads (0,3) & (1,2) for (unsigned int x1 = i; x1 < j; x1++) { t1 = result_tab[x1].num % 4; @@ -514,7 +518,7 @@ static void do_process_read(process_read_arg_t *arg) t2 = result_tab[x2].num % 4; if (t1 + t2 == 3) // select significant pair { - if ((abs((int)pos2 - (int)pos1) > 130 && (abs((int)pos2 - (int)pos1) < 430))) { + if ((abs((int)pos2 - (int)pos1) > 50 && (abs((int)pos2 - (int)pos1) < 1000))) { if (result_tab[x1].score + result_tab[x2].score < best_score) { np = 0; best_score = result_tab[x1].score + result_tab[x2].score; @@ -530,6 +534,21 @@ static void do_process_read(process_read_arg_t *arg) } } } + else { + + if (result_tab[x1].score + result_tab[x2].score < best_score_all) { + np_all = 0; + best_score_all = result_tab[x1].score + result_tab[x2].score; + P1_all[np_all] = x1; + P2_all[np_all] = x2; + np_all++; + } else if (result_tab[x1].score + result_tab[x2].score == best_score_all) { + P1_all[np_all] = x1; + P2_all[np_all] = x2; + if (np_all < 999) + np_all++; + } + } } } } @@ -542,6 +561,16 @@ static void do_process_read(process_read_arg_t *arg) update_frequency_table(ref_genome, result_tab, reads_buffer, P1[i]); update_frequency_table(ref_genome, result_tab, reads_buffer, P2[i]); } + } + else if (np_all > 0) { + + // update frequency table + for (unsigned int i = 0; i < np_all; i++) { + + // for each matching position of this read in the reference genome, add +1 in the corresponding nucleotide column + update_frequency_table(ref_genome, result_tab, reads_buffer, P1_all[i]); + update_frequency_table(ref_genome, result_tab, reads_buffer, P2_all[i]); + } } else { pthread_mutex_lock(&nr_reads_mutex); nr_reads_non_mapped++; diff 
--git a/host/src/vartree.c b/host/src/vartree.c index d247fa4..b3be1ba 100644 --- a/host/src/vartree.c +++ b/host/src/vartree.c @@ -143,6 +143,7 @@ depth_filter_t indel_filter[] = { #endif #endif +#if 0 static bool homopolymer(int8_t *seq, int offset) { for (int i = 0; i < offset - 1; i++) { @@ -152,6 +153,7 @@ static bool homopolymer(int8_t *seq, int offset) } return true; } +#endif static bool print_variant_tree(variant_t *var, uint32_t seq_nr, uint64_t seq_pos, genome_t *ref_genome, FILE *vcf_file) { @@ -160,16 +162,16 @@ static bool print_variant_tree(variant_t *var, uint32_t seq_nr, uint64_t seq_pos uint32_t cov = ref_genome->mapping_coverage[genome_pos]; uint32_t depth = var->depth; uint32_t score = var->score / depth; - uint32_t percentage = 100; - if (cov != 0) { - percentage = depth * 100 / cov; - } + //uint32_t percentage = 100; + //if (cov != 0) { + // percentage = depth * 100 / cov; + //} uint32_t ref_len = strlen(var->ref); uint32_t alt_len = strlen(var->alt); - if (ref_len > alt_len && percentage <= 25 && homopolymer(&ref_genome->data[genome_pos - 12], 12)) { - return false; - } + //if (ref_len > alt_len && percentage <= 25 && homopolymer(&ref_genome->data[genome_pos - 12], 12)) { + // return false; + //} if (get_no_filter()) goto print; @@ -205,6 +207,9 @@ static bool print_variant_tree(variant_t *var, uint32_t seq_nr, uint64_t seq_pos return true; } +#define FREQUENCY_THRESHOLD 0.20 +#define DEPTH_THRESHOLD 2 + static variant_t ** get_most_frequent_variant(genome_t * ref_genome, struct frequency_info ** frequency_table, uint64_t genome_pos) { static char nucleotide[4] = { 'A', 'C', 'T', 'G' }; @@ -218,12 +223,12 @@ static variant_t ** get_most_frequent_variant(genome_t * ref_genome, struct freq for(int i = 0; i < 5; ++i) { float freq = frequency_table[i][genome_pos].freq; if(i == ref_genome->data[genome_pos]) continue; // not a variant if the same nucleotide as in reference genome - if((freq / total > 0.2) && freq > 3) { // if frequency > 20% and 
depth > 3, consider it a variant + if((freq / total > FREQUENCY_THRESHOLD) + && freq > DEPTH_THRESHOLD) { // if frequency > 20% and depth > 3, consider it a variant // this is a substitution, create variant variant_t *var = (variant_t *)malloc(sizeof(variant_t)); var->score = frequency_table[i][genome_pos].score; - // TODO: at the moment the number of matches and the score is the same. If we start having weights, we should store both the count and score in frequency table var->depth = frequency_table[i][genome_pos].freq; var->ref[0] = nucleotide[ref_genome->data[genome_pos]]; var->ref[1] = '\0'; From aa0df86c28b2aabe03cce97ea6cd7b7f60c70051 Mon Sep 17 00:00:00 2001 From: Julien Legriel Date: Wed, 17 Mar 2021 11:34:07 +0100 Subject: [PATCH 08/48] Implementation of mapq score --- host/CMakeLists.txt | 2 +- host/src/processread.c | 278 +++++++++++++++++++++++++++++------------ host/src/vartree.c | 11 +- tests/compareVCF.py | 3 + 4 files changed, 210 insertions(+), 84 deletions(-) diff --git a/host/CMakeLists.txt b/host/CMakeLists.txt index c083595..1bdbfb6 100644 --- a/host/CMakeLists.txt +++ b/host/CMakeLists.txt @@ -21,7 +21,7 @@ file(GLOB_RECURSE SOURCES src/*.c) add_executable(upvc ${SOURCES}) target_include_directories(upvc PUBLIC "${DPU_HOST_INCLUDE_DIRECTORIES}" inc/ ../common/inc/) -target_link_libraries(upvc ${DPU_HOST_LIBRARIES} pthread) +target_link_libraries(upvc ${DPU_HOST_LIBRARIES} pthread m) set(NB_DPU_MARK) if (NB_DPU) diff --git a/host/src/processread.c b/host/src/processread.c index 6fa52e6..cfdadc0 100644 --- a/host/src/processread.c +++ b/host/src/processread.c @@ -7,6 +7,7 @@ #include #include #include +#include #include "accumulateread.h" #include "genome.h" @@ -34,6 +35,7 @@ #define PATH_SUBSTITUTION (0) #define PATH_INSERTION (1) #define PATH_DELETION (2) +#define MAX_SUBSTITUTION (4) typedef struct { int type; @@ -413,7 +415,8 @@ static void update_frequency_table( genome_t *ref_genome, dpu_result_out_t *result_tab, int8_t *reads_buffer, - 
int pos) { + int pos, + float mapq) { struct frequency_info **frequency_table = get_frequency_table(); uint64_t genome_pos = ref_genome->pt_seq[result_tab[pos].coord.seq_nr] + result_tab[pos].coord.seed_nr; @@ -421,7 +424,7 @@ static void update_frequency_table( int8_t *read = reads_buffer + (num * SIZE_READ); for(int j = 0; j < SIZE_READ; ++j) { if(genome_pos + j < genome_get()->fasta_file_size) { - frequency_table[read[j]][genome_pos+j].freq++; + frequency_table[read[j]][genome_pos+j].freq += mapq; frequency_table[read[j]][genome_pos+j].score += result_tab[pos].score; } else @@ -459,6 +462,33 @@ static uint64_t nr_reads_total = 0ULL; static uint64_t nr_reads_non_mapped = 0ULL; static pthread_mutex_t nr_reads_mutex = PTHREAD_MUTEX_INITIALIZER; +#define MISMATCH_COUNT(X) (X.score / 10) +#define DIST_PAIR_THRESHOLD 0 +#define DIST_SINGLE_THRESHOLD 0 +#define MAPQ_SCALING_FACTOR 2 + +static void keep_best_2_scores(unsigned score, unsigned* P1, unsigned *P2, unsigned x1, unsigned x2, unsigned* best_score) { + + if(score < best_score[0]) { + + // move current to next position + best_score[1] = best_score[0]; + P1[1] = P1[0]; + P2[1] = P2[0]; + // update first position + best_score[0] = score; + P1[0] = x1; + P2[0] = x2; + } + else if (score < best_score[1]) { + + // update second position + best_score[1] = score; + P1[1] = x1; + P2[1] = x2; + } +} + static void do_process_read(process_read_arg_t *arg) { const unsigned int nb_match = arg->nb_match; @@ -485,93 +515,176 @@ static void do_process_read(process_read_arg_t *arg) */ while (true) { - unsigned int i; - if ((i = acquire_curr_match()) >= nb_match) { - release_curr_match(i); - return; + unsigned int i; + if ((i = acquire_curr_match()) >= nb_match) { + release_curr_match(i); + return; + } + int numpair = result_tab[i].num / 4; + unsigned int j = i; + while ((j < nb_match) && (numpair == result_tab[j].num / 4)) { + j++; + } + release_curr_match(j); + + // i = start index in result_tab + // j = stop index in 
result_tab + // select best couples of paired reads + unsigned int P1[2]; + unsigned int P2[2]; + //unsigned int P1_all[1000]; + //unsigned int P2_all[1000]; + //unsigned int np_all = 0; + unsigned int pos1, pos2, t1, t2; + unsigned int best_score[2] = { 1000, 1000 }; + /*unsigned int best_score_all = 1000;*/ + // test all significant pairs of reads (0,3) & (1,2) + for (unsigned int x1 = i; x1 < j; x1++) { + t1 = result_tab[x1].num % 4; + pos1 = result_tab[x1].coord.seed_nr; + for (unsigned int x2 = i + 1; x2 < j; x2++) { + pos2 = result_tab[x2].coord.seed_nr; + t2 = result_tab[x2].num % 4; + if (t1 + t2 == 3) // select significant pair + { + if ((abs((int)pos2 - (int)pos1) > 50 && (abs((int)pos2 - (int)pos1) < 2000))) { + // update if this is one of the two best scores + keep_best_2_scores(result_tab[x1].score + result_tab[x2].score, P1, P2, x1, x2, best_score); + } + } } - int numpair = result_tab[i].num / 4; - unsigned int j = i; - while ((j < nb_match) && (numpair == result_tab[j].num / 4)) { - j++; + } + + bool update = false; + unsigned np = 0; + if(best_score[0] < 1000) { + np++; + if(best_score[1] < 1000) np++; + } + if (np > 0) { + + if(np == 2) { + + // found at least 2 matching pairs of positions. 
Check the delta between the two pairs to + // decide whether we should keep the best pair + int delta = abs((MISMATCH_COUNT(result_tab[P1[0]]) + MISMATCH_COUNT(result_tab[P2[0]])) + - (MISMATCH_COUNT(result_tab[P1[1]]) + MISMATCH_COUNT(result_tab[P2[1]]))); + + int delta_corrected = MISMATCH_COUNT(result_tab[P1[0]]) + + MISMATCH_COUNT(result_tab[P2[0]]) + MAPQ_SCALING_FACTOR * ((2 * (MAX_SUBSTITUTION + 1)) - delta); + if(delta_corrected < 0) { + printf("WARNING: negative delta for square root %d\n", delta_corrected); + } + else if(delta > DIST_PAIR_THRESHOLD) { + float mapq = 1.0 - sqrt((double)delta_corrected / SIZE_READ); + update_frequency_table(ref_genome, result_tab, reads_buffer, P1[0], mapq); + update_frequency_table(ref_genome, result_tab, reads_buffer, P2[0], mapq); + update = true; + } } - release_curr_match(j); - - // i = start index in result_tab - // j = stop index in result_tab - // select best couples of paired reads - unsigned int P1[1000]; - unsigned int P2[1000]; - unsigned int np = 0; - unsigned int P1_all[1000]; - unsigned int P2_all[1000]; - unsigned int np_all = 0; - unsigned int pos1, pos2, t1, t2; - unsigned int best_score = 1000; - unsigned int best_score_all = 1000; - // test all significant pairs of reads (0,3) & (1,2) - for (unsigned int x1 = i; x1 < j; x1++) { - t1 = result_tab[x1].num % 4; - pos1 = result_tab[x1].coord.seed_nr; - for (unsigned int x2 = i + 1; x2 < j; x2++) { - pos2 = result_tab[x2].coord.seed_nr; - t2 = result_tab[x2].num % 4; - if (t1 + t2 == 3) // select significant pair - { - if ((abs((int)pos2 - (int)pos1) > 50 && (abs((int)pos2 - (int)pos1) < 1000))) { - if (result_tab[x1].score + result_tab[x2].score < best_score) { - np = 0; - best_score = result_tab[x1].score + result_tab[x2].score; - P1[np] = x1; - P2[np] = x2; - np++; - } else { - if (result_tab[x1].score + result_tab[x2].score == best_score) { - P1[np] = x1; - P2[np] = x2; - if (np < 999) - np++; - } - } - } - else { - - if (result_tab[x1].score + 
result_tab[x2].score < best_score_all) { - np_all = 0; - best_score_all = result_tab[x1].score + result_tab[x2].score; - P1_all[np_all] = x1; - P2_all[np_all] = x2; - np_all++; - } else if (result_tab[x1].score + result_tab[x2].score == best_score_all) { - P1_all[np_all] = x1; - P2_all[np_all] = x2; - if (np_all < 999) - np_all++; - } - } - } - } + else if(np) { // only one result, take it + int delta = abs((MISMATCH_COUNT(result_tab[P1[0]]) + MISMATCH_COUNT(result_tab[P2[0]])) - (2 * (MAX_SUBSTITUTION + 1))); + int delta_corrected = MISMATCH_COUNT(result_tab[P1[0]]) + MISMATCH_COUNT(result_tab[P2[0]]) + MAPQ_SCALING_FACTOR * ((2 * (MAX_SUBSTITUTION + 1)) - delta); + if(delta_corrected < 0) { + printf("WARNING: negative delta (np == 1) for square root %d\n", delta_corrected); + } + else if(delta > DIST_PAIR_THRESHOLD) { + float mapq = 1.0 - sqrt((double)delta_corrected / SIZE_READ); + update_frequency_table(ref_genome, result_tab, reads_buffer, P1[0], mapq); + update_frequency_table(ref_genome, result_tab, reads_buffer, P2[0], mapq); + update = true; + } + //update_frequency_table(ref_genome, result_tab, reads_buffer, P1[0]); + //update_frequency_table(ref_genome, result_tab, reads_buffer, P2[0]); + //update = true; + } + } + if(true) { + + // check mapping of R1 and R2 independently + unsigned int best_score_R1[2] = { 1000, 1000 }; + unsigned int best_score_R2[2] = { 1000, 1000 }; + P1[0] = 0; + P2[0] = 0; + P1[1] = 0; + P2[1] = 0; + for (unsigned int read = i; read < j; read++) { + unsigned t1 = result_tab[read].num % 4; + if(t1 < 2) { // PE1 or RPE1 + keep_best_2_scores(result_tab[read].score, P1, P2, read, 0, best_score_R1); + } + else { // PE2 or RPE2 + keep_best_2_scores(result_tab[read].score, P1, P2, 0, read, best_score_R2); + } } - if (np > 0) { - - // update frequency table - for (unsigned int i = 0; i < np; i++) { - // for each matching position of this read in the reference genome, add +1 in the corresponding nucleotide column - 
update_frequency_table(ref_genome, result_tab, reads_buffer, P1[i]); - update_frequency_table(ref_genome, result_tab, reads_buffer, P2[i]); - } + unsigned np1 = 0, np2 = 0; + if(best_score_R1[0] < 1000) np1++; + if(best_score_R1[1] < 1000) np1++; + if(best_score_R2[0] < 1000) np2++; + if(best_score_R2[1] < 1000) np2++; + if(np1 == 2) { + + int delta = abs(MISMATCH_COUNT(result_tab[P1[0]]) - MISMATCH_COUNT(result_tab[P1[1]])); + + int delta_corrected = MISMATCH_COUNT(result_tab[P1[0]]) + MAPQ_SCALING_FACTOR * ((MAX_SUBSTITUTION + 1) - delta); + + if(delta_corrected < 0) { + printf("WARNING: negative delta (np1 == 2) for square root %d\n", delta_corrected); + } + else if(delta > DIST_SINGLE_THRESHOLD) { + float mapq = 1.0 - sqrt((double)delta_corrected / SIZE_READ); + update_frequency_table(ref_genome, result_tab, reads_buffer, P1[0], mapq); + update = true; + } + } + else if(np1) { + int delta = abs(MISMATCH_COUNT(result_tab[P1[0]]) - (MAX_SUBSTITUTION + 1)); + + int delta_corrected = MISMATCH_COUNT(result_tab[P1[0]]) + MAPQ_SCALING_FACTOR * ((MAX_SUBSTITUTION + 1) - delta); + + if(delta_corrected < 0) { + printf("WARNING: negative delta (np1 == 1) for square root %d\n", delta_corrected); + } + else if(delta > DIST_SINGLE_THRESHOLD) { + float mapq = 1.0 - sqrt((double)delta_corrected / SIZE_READ); + update_frequency_table(ref_genome, result_tab, reads_buffer, P1[0], mapq); + update = true; + } } - else if (np_all > 0) { - // update frequency table - for (unsigned int i = 0; i < np_all; i++) { + if(np2 == 2) { - // for each matching position of this read in the reference genome, add +1 in the corresponding nucleotide column - update_frequency_table(ref_genome, result_tab, reads_buffer, P1_all[i]); - update_frequency_table(ref_genome, result_tab, reads_buffer, P2_all[i]); - } - } else { + int delta = abs(MISMATCH_COUNT(result_tab[P2[0]]) - MISMATCH_COUNT(result_tab[P2[1]])); + + int delta_corrected = MISMATCH_COUNT(result_tab[P2[0]]) + MAPQ_SCALING_FACTOR * 
((MAX_SUBSTITUTION + 1) - delta); + + if(delta_corrected < 0) { + printf("WARNING: negative delta (np2 == 2) for square root %d, %d %d %d %d\n", + delta_corrected, MISMATCH_COUNT(result_tab[P2[0]]), MISMATCH_COUNT(result_tab[P2[1]]), MAX_SUBSTITUTION + 1, delta); + } + else if(delta > DIST_SINGLE_THRESHOLD) { + float mapq = 1.0 - sqrt((double)delta_corrected / SIZE_READ); + + update_frequency_table(ref_genome, result_tab, reads_buffer, P2[0], mapq); + update = true; + } + } + else if(np2) { + int delta = abs(MISMATCH_COUNT(result_tab[P2[0]]) - (MAX_SUBSTITUTION + 1)); + + int delta_corrected = MISMATCH_COUNT(result_tab[P2[0]]) + MAPQ_SCALING_FACTOR * ((MAX_SUBSTITUTION + 1) - delta); + + if(delta_corrected < 0) { + printf("WARNING: negative delta (np2 == 1) for square root %d\n", delta_corrected); + } + else if (delta > DIST_SINGLE_THRESHOLD) { + float mapq = 1.0 - sqrt((double)delta_corrected / SIZE_READ); + update_frequency_table(ref_genome, result_tab, reads_buffer, P2[0], mapq); + update = true; + } + } + if(!update) { pthread_mutex_lock(&nr_reads_mutex); nr_reads_non_mapped++; pthread_mutex_unlock(&nr_reads_mutex); @@ -580,6 +693,7 @@ static void do_process_read(process_read_arg_t *arg) pthread_mutex_lock(&nr_reads_mutex); nr_reads_total++; pthread_mutex_unlock(&nr_reads_mutex); + } } } diff --git a/host/src/vartree.c b/host/src/vartree.c index b3be1ba..8a987c6 100644 --- a/host/src/vartree.c +++ b/host/src/vartree.c @@ -207,9 +207,11 @@ static bool print_variant_tree(variant_t *var, uint32_t seq_nr, uint64_t seq_pos return true; } -#define FREQUENCY_THRESHOLD 0.20 +#define FREQUENCY_THRESHOLD 0.15 #define DEPTH_THRESHOLD 2 +FILE * dbg_file = NULL; + static variant_t ** get_most_frequent_variant(genome_t * ref_genome, struct frequency_info ** frequency_table, uint64_t genome_pos) { static char nucleotide[4] = { 'A', 'C', 'T', 'G' }; @@ -237,6 +239,9 @@ static variant_t ** get_most_frequent_variant(genome_t * ref_genome, struct freq results[i] = var; } } + 
fprintf(dbg_file, "pos: %lu ref nucleotide %c frequencies: A:%f C:%f T:%f G:%f\n", genome_pos, nucleotide[ref_genome->data[genome_pos]], + frequency_table[0][genome_pos].freq, frequency_table[1][genome_pos].freq, + frequency_table[2][genome_pos].freq, frequency_table[3][genome_pos].freq); //printf("get_most_frequent_variant: genome_pos %lu, nucleotide max freq %d %f %c\n", genome_pos, nucId, max, nucId >= 0 ? nucleotide[nucId] : '-'); return results; @@ -282,6 +287,8 @@ void create_vcf() struct frequency_info **frequency_table = get_frequency_table(); uint32_t nb_pos_multiple_var = 0; + dbg_file = fopen("freq_debug.txt", "w"); + /* for each sequence in the genome */ for (uint32_t seq_number = 0; seq_number < ref_genome->nb_seq; seq_number++) { /* for each position in the sequence */ @@ -304,6 +311,8 @@ void create_vcf() } } + fclose(dbg_file); + free_frequency_table(); fclose(vcf_file); diff --git a/tests/compareVCF.py b/tests/compareVCF.py index 0678c85..767b355 100755 --- a/tests/compareVCF.py +++ b/tests/compareVCF.py @@ -256,12 +256,15 @@ def compute(V1, V2, tp_stat, fp_stat): tp = 0 fp = 0 for (chr, pos), v1_infos in V1.items(): + prev_fp = fp if (chr, pos) in V2: tp, fp = compute_for_pos( v1_infos, V2[(chr, pos)], tp, fp, tp_stat, fp_stat) else: fp += len(v1_infos) update_stat_for_pos(fp_stat, v1_infos) + #if(fp > prev_fp): + # print chr, pos return tp, fp From f9f6f7763ec8f244334592d4fb48414adc81cd96 Mon Sep 17 00:00:00 2001 From: Julien Legriel Date: Fri, 26 Mar 2021 09:13:07 +0100 Subject: [PATCH 09/48] Add some filters and read of quality information --- dpu/src/task.c | 17 ++++---- host/inc/getread.h | 1 + host/src/getread.c | 59 ++++++++++++++++++++++----- host/src/processread.c | 72 ++++++++++++++++++++++++-------- host/src/vartree.c | 93 +++++++++++++++++++++++++++++++++++++----- tests/compareVCF.py | 22 +++++----- 6 files changed, 209 insertions(+), 55 deletions(-) diff --git a/dpu/src/task.c b/dpu/src/task.c index b561666..18c945a 100644 --- 
a/dpu/src/task.c +++ b/dpu/src/task.c @@ -121,15 +121,18 @@ static void compare_neighbours(sysname_t tasklet_id, uint32_t *mini, coords_and_ STATS_STORE_NODP_TIME(tasklet_stats, (end + acc - start)); STATS_INCR_NB_NODP_CALLS(*tasklet_stats); - //if (score_nodp == UINT_MAX) { - // STATS_GET_START_TIME(start, acc, end); + //TODO uncomment for indel +#ifdef USE_INDEL + if (score_nodp == UINT_MAX) { + STATS_GET_START_TIME(start, acc, end); - // score_odpd = score = odpd(current_read_nbr, ref_nbr, *mini, NB_BYTES_TO_SYMS(SIZE_NEIGHBOUR_IN_BYTES, DPU_MRAM_INFO_VAR)); + score_odpd = score = odpd(current_read_nbr, ref_nbr, *mini, NB_BYTES_TO_SYMS(SIZE_NEIGHBOUR_IN_BYTES, DPU_MRAM_INFO_VAR)); - // STATS_GET_END_TIME(end, acc); - // STATS_STORE_ODPD_TIME(tasklet_stats, (end + acc - start)); - // STATS_INCR_NB_ODPD_CALLS(*tasklet_stats); - //} + STATS_GET_END_TIME(end, acc); + STATS_STORE_ODPD_TIME(tasklet_stats, (end + acc - start)); + STATS_INCR_NB_ODPD_CALLS(*tasklet_stats); + } +#endif if (score > *mini) { return; diff --git a/host/inc/getread.h b/host/inc/getread.h index e2654ea..58b8253 100644 --- a/host/inc/getread.h +++ b/host/inc/getread.h @@ -11,6 +11,7 @@ int get_reads_in_buffer(unsigned int pass_id); int8_t *get_reads_buffer(unsigned int pass_id); +float *get_reads_quality_buffer(unsigned int pass_id); void get_reads(FILE *fpe1, FILE *fpe2, unsigned int pass_id); diff --git a/host/src/getread.c b/host/src/getread.c index e3a8197..943a320 100644 --- a/host/src/getread.c +++ b/host/src/getread.c @@ -4,9 +4,11 @@ #include #include +#include #include #include #include +#include #include "common.h" #include "getread.h" @@ -17,18 +19,24 @@ static int nb_reads[NB_READS_BUFFER]; static int8_t *reads_buffers[NB_READS_BUFFER]; +static float *reads_quality_buffers[NB_READS_BUFFER]; +static float quality_lookup_table[43]; +static bool lookup = false; #define PASS(pass_id) (pass_id % NB_READS_BUFFER) +#define MAX(X, Y) (((X) > (Y)) ? 
(X) : (Y)) + /** * @brief Parse the file "f" to get the next read in the file and its pair. * - * @param f File to parse. - * @param read1 Output the next read in the file. - * @param read2 Output the pair of the next read in the file. + * @param f File to parse. + * @param read1 Output the next read in the file. + * @param read2 Output the pair of the next read in the file. + * @param read_quality_factor Output the quality factor of the read. * * @return The size of the read. */ -static int get_seq_fast_AQ(FILE *f, int8_t *read1, int8_t *read2) +static int get_seq_fast_AQ(FILE *f, int8_t *read1, int8_t *read2, float *read_quality_factor) { static const int invnt[4] = { 2, 3, 0, 1 }; int offset = 0; @@ -46,10 +54,10 @@ static int get_seq_fast_AQ(FILE *f, int8_t *read1, int8_t *read2) * it means that we need the skip the first 14 characters of the read. */ if (comment[1] == '>') { - sscanf(&comment[2], "%d", &offset); - } - int i; - for (i = 0; i < SIZE_READ - offset; i++) { + sscanf(&comment[2], "%d", &offset); + } + int i; + for (i = 0; i < SIZE_READ - offset; i++) { read1[i] = (((int)sequence_buffer[i]) >> 1) & 3; read2[SIZE_READ - i - 1 - offset] = invnt[read1[i]]; } @@ -64,9 +72,20 @@ static int get_seq_fast_AQ(FILE *f, int8_t *read1, int8_t *read2) if (fgets(comment, MAX_BUF_SIZE, f) == NULL) { /* Commentary */ return -1; } - if (fgets(sequence_buffer, MAX_SEQ_SIZE, f) == NULL) { /* Line with sequence quality information (unused) */ + if (fgets(sequence_buffer, MAX_SEQ_SIZE, f) == NULL) { /* Line with sequence quality information */ return -1; } + //TODO: store quality information + for (i = 0; i < SIZE_READ - offset; i++) { + int Q = sequence_buffer[i]; + /*printf("index %d SIZE_READ %d\n", i, SIZE_READ);*/ + /*fflush(stdout);*/ + read_quality_factor[i] = quality_lookup_table[Q-33]; + } + for (; i < SIZE_READ; i++) { + read_quality_factor[i] = 1.0f; + } + return SIZE_READ; } @@ -75,16 +94,33 @@ void get_reads(FILE *fpe1, FILE *fpe2, unsigned int pass_id) int 
nb_read = 0; pass_id = PASS(pass_id); + if(!lookup) { + + for(uint8_t i = 33; i < 76; ++i) { + quality_lookup_table[i - 33] = MAX(0.001f, 1.0f - pow(10.0f, -(i-33)/10.0f)); + printf("%d %f\n", i, quality_lookup_table[i - 33]); + } + lookup = true; + } + int8_t *reads_buffer = reads_buffers[pass_id]; + float* reads_quality_buffer = reads_quality_buffers[pass_id]; if (reads_buffer == NULL) { reads_buffer = (int8_t *)malloc(MAX_READS_BUFFER * SIZE_READ); + reads_quality_buffer = (float *)malloc(MAX_READS_BUFFER/2 * SIZE_READ * sizeof(float)); assert(reads_buffer != NULL); + assert(reads_quality_buffer != NULL); reads_buffers[pass_id] = reads_buffer; + reads_quality_buffers[pass_id] = reads_quality_buffer; } while (nb_read < MAX_READS_BUFFER) { - if ((get_seq_fast_AQ(fpe1, &reads_buffer[(nb_read + 0) * SIZE_READ], &reads_buffer[(nb_read + 1) * SIZE_READ]) <= 0) - || (get_seq_fast_AQ(fpe2, &reads_buffer[(nb_read + 2) * SIZE_READ], &reads_buffer[(nb_read + 3) * SIZE_READ]) <= 0)) + /*assert(nb_read/2 + 1 < MAX_READS_BUFFER/2);*/ + /*printf("nb_read %d MAX_READS_BUFFER %d\n", nb_read, MAX_READS_BUFFER);*/ + if ((get_seq_fast_AQ(fpe1, &reads_buffer[(nb_read + 0) * SIZE_READ], &reads_buffer[(nb_read + 1) * SIZE_READ], + &reads_quality_buffer[nb_read/2 * SIZE_READ]) <= 0) + || (get_seq_fast_AQ(fpe2, &reads_buffer[(nb_read + 2) * SIZE_READ], &reads_buffer[(nb_read + 3) * SIZE_READ], + &reads_quality_buffer[(nb_read/2 + 1) * SIZE_READ]) <= 0)) break; nb_read += 4; } @@ -95,6 +131,7 @@ void get_reads(FILE *fpe1, FILE *fpe2, unsigned int pass_id) int get_reads_in_buffer(unsigned int pass_id) { return nb_reads[PASS(pass_id)]; } int8_t *get_reads_buffer(unsigned int pass_id) { return reads_buffers[PASS(pass_id)]; } +float *get_reads_quality_buffer(unsigned int pass_id) { return reads_quality_buffers[PASS(pass_id)]; } int get_input_info(FILE *f, size_t *read_size, size_t *nb_read) { diff --git a/host/src/processread.c b/host/src/processread.c index cfdadc0..d8d7e49 100644 --- 
a/host/src/processread.c +++ b/host/src/processread.c @@ -415,6 +415,7 @@ static void update_frequency_table( genome_t *ref_genome, dpu_result_out_t *result_tab, int8_t *reads_buffer, + float *reads_quality_buffer, int pos, float mapq) { @@ -422,9 +423,13 @@ static void update_frequency_table( uint64_t genome_pos = ref_genome->pt_seq[result_tab[pos].coord.seq_nr] + result_tab[pos].coord.seed_nr; int num = result_tab[pos].num; int8_t *read = reads_buffer + (num * SIZE_READ); + float *read_quality = reads_quality_buffer + (num/2 * SIZE_READ); + //TODO: read the quality in the correct order (inverted or not) + bool inv = num & 1; + //TODO: assume no offset here for(int j = 0; j < SIZE_READ; ++j) { if(genome_pos + j < genome_get()->fasta_file_size) { - frequency_table[read[j]][genome_pos+j].freq += mapq; + frequency_table[read[j]][genome_pos+j].freq += mapq * read_quality[inv ? SIZE_READ - j - 1 : j]; frequency_table[read[j]][genome_pos+j].score += result_tab[pos].score; } else @@ -453,6 +458,7 @@ typedef struct { dpu_result_out_t *result_tab; int round; int8_t *reads_buffer; + float *reads_quality_buffer; genome_t *ref_genome; FILE *fpe1; FILE *fpe2; @@ -463,7 +469,7 @@ static uint64_t nr_reads_non_mapped = 0ULL; static pthread_mutex_t nr_reads_mutex = PTHREAD_MUTEX_INITIALIZER; #define MISMATCH_COUNT(X) (X.score / 10) -#define DIST_PAIR_THRESHOLD 0 +#define DIST_PAIR_THRESHOLD 1 #define DIST_SINGLE_THRESHOLD 0 #define MAPQ_SCALING_FACTOR 2 @@ -489,12 +495,14 @@ static void keep_best_2_scores(unsigned score, unsigned* P1, unsigned *P2, unsig } } +/*#define USE_MAPQ_SCORE*/ static void do_process_read(process_read_arg_t *arg) { const unsigned int nb_match = arg->nb_match; dpu_result_out_t *result_tab = arg->result_tab; int round = arg->round; int8_t *reads_buffer = arg->reads_buffer; + float *reads_quality_buffer = arg->reads_quality_buffer; genome_t *ref_genome = arg->ref_genome; FILE *fpe1 = arg->fpe1; FILE *fpe2 = arg->fpe2; @@ -570,28 +578,38 @@ static void 
do_process_read(process_read_arg_t *arg) int delta = abs((MISMATCH_COUNT(result_tab[P1[0]]) + MISMATCH_COUNT(result_tab[P2[0]])) - (MISMATCH_COUNT(result_tab[P1[1]]) + MISMATCH_COUNT(result_tab[P2[1]]))); + float mapq = 1.0f; +#ifdef USE_MAPQ_SCORE int delta_corrected = MISMATCH_COUNT(result_tab[P1[0]]) + MISMATCH_COUNT(result_tab[P2[0]]) + MAPQ_SCALING_FACTOR * ((2 * (MAX_SUBSTITUTION + 1)) - delta); if(delta_corrected < 0) { printf("WARNING: negative delta for square root %d\n", delta_corrected); } else if(delta > DIST_PAIR_THRESHOLD) { - float mapq = 1.0 - sqrt((double)delta_corrected / SIZE_READ); - update_frequency_table(ref_genome, result_tab, reads_buffer, P1[0], mapq); - update_frequency_table(ref_genome, result_tab, reads_buffer, P2[0], mapq); + mapq = 1.0 - sqrt((double)delta_corrected / SIZE_READ); +#else + if(delta > DIST_PAIR_THRESHOLD) { +#endif + update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P1[0], mapq); + update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P2[0], mapq); update = true; } } else if(np) { // only one result, take it int delta = abs((MISMATCH_COUNT(result_tab[P1[0]]) + MISMATCH_COUNT(result_tab[P2[0]])) - (2 * (MAX_SUBSTITUTION + 1))); + float mapq = 1.0f; +#ifdef USE_MAPQ_SCORE int delta_corrected = MISMATCH_COUNT(result_tab[P1[0]]) + MISMATCH_COUNT(result_tab[P2[0]]) + MAPQ_SCALING_FACTOR * ((2 * (MAX_SUBSTITUTION + 1)) - delta); if(delta_corrected < 0) { printf("WARNING: negative delta (np == 1) for square root %d\n", delta_corrected); } else if(delta > DIST_PAIR_THRESHOLD) { - float mapq = 1.0 - sqrt((double)delta_corrected / SIZE_READ); - update_frequency_table(ref_genome, result_tab, reads_buffer, P1[0], mapq); - update_frequency_table(ref_genome, result_tab, reads_buffer, P2[0], mapq); + mapq = 1.0 - sqrt((double)delta_corrected / SIZE_READ); +#else + if(delta > DIST_PAIR_THRESHOLD) { +#endif + update_frequency_table(ref_genome, result_tab, reads_buffer, 
reads_quality_buffer, P1[0], mapq); + update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P2[0], mapq); update = true; } //update_frequency_table(ref_genome, result_tab, reads_buffer, P1[0]); @@ -627,28 +645,38 @@ static void do_process_read(process_read_arg_t *arg) int delta = abs(MISMATCH_COUNT(result_tab[P1[0]]) - MISMATCH_COUNT(result_tab[P1[1]])); + float mapq = 1.0f; +#ifdef USE_MAPQ_SCORE int delta_corrected = MISMATCH_COUNT(result_tab[P1[0]]) + MAPQ_SCALING_FACTOR * ((MAX_SUBSTITUTION + 1) - delta); if(delta_corrected < 0) { printf("WARNING: negative delta (np1 == 2) for square root %d\n", delta_corrected); } else if(delta > DIST_SINGLE_THRESHOLD) { - float mapq = 1.0 - sqrt((double)delta_corrected / SIZE_READ); - update_frequency_table(ref_genome, result_tab, reads_buffer, P1[0], mapq); + mapq = 1.0 - sqrt((double)delta_corrected / SIZE_READ); +#else + if(delta > DIST_SINGLE_THRESHOLD) { +#endif + update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P1[0], mapq); update = true; } } else if(np1) { int delta = abs(MISMATCH_COUNT(result_tab[P1[0]]) - (MAX_SUBSTITUTION + 1)); + float mapq = 1.0f; +#ifdef USE_MAPQ_SCORE int delta_corrected = MISMATCH_COUNT(result_tab[P1[0]]) + MAPQ_SCALING_FACTOR * ((MAX_SUBSTITUTION + 1) - delta); if(delta_corrected < 0) { printf("WARNING: negative delta (np1 == 1) for square root %d\n", delta_corrected); } else if(delta > DIST_SINGLE_THRESHOLD) { - float mapq = 1.0 - sqrt((double)delta_corrected / SIZE_READ); - update_frequency_table(ref_genome, result_tab, reads_buffer, P1[0], mapq); + mapq = 1.0 - sqrt((double)delta_corrected / SIZE_READ); +#else + if(delta > DIST_SINGLE_THRESHOLD) { +#endif + update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P1[0], mapq); update = true; } } @@ -657,6 +685,8 @@ static void do_process_read(process_read_arg_t *arg) int delta = abs(MISMATCH_COUNT(result_tab[P2[0]]) - 
MISMATCH_COUNT(result_tab[P2[1]])); + float mapq = 1.0f; +#ifdef USE_MAPQ_SCORE int delta_corrected = MISMATCH_COUNT(result_tab[P2[0]]) + MAPQ_SCALING_FACTOR * ((MAX_SUBSTITUTION + 1) - delta); if(delta_corrected < 0) { @@ -664,23 +694,31 @@ static void do_process_read(process_read_arg_t *arg) delta_corrected, MISMATCH_COUNT(result_tab[P2[0]]), MISMATCH_COUNT(result_tab[P2[1]]), MAX_SUBSTITUTION + 1, delta); } else if(delta > DIST_SINGLE_THRESHOLD) { - float mapq = 1.0 - sqrt((double)delta_corrected / SIZE_READ); + mapq = 1.0 - sqrt((double)delta_corrected / SIZE_READ); +#else + if(delta > DIST_SINGLE_THRESHOLD) { +#endif - update_frequency_table(ref_genome, result_tab, reads_buffer, P2[0], mapq); + update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P2[0], mapq); update = true; } } else if(np2) { int delta = abs(MISMATCH_COUNT(result_tab[P2[0]]) - (MAX_SUBSTITUTION + 1)); + float mapq = 1.0f; +#ifdef USE_MAPQ_SCORE int delta_corrected = MISMATCH_COUNT(result_tab[P2[0]]) + MAPQ_SCALING_FACTOR * ((MAX_SUBSTITUTION + 1) - delta); if(delta_corrected < 0) { printf("WARNING: negative delta (np2 == 1) for square root %d\n", delta_corrected); } else if (delta > DIST_SINGLE_THRESHOLD) { - float mapq = 1.0 - sqrt((double)delta_corrected / SIZE_READ); - update_frequency_table(ref_genome, result_tab, reads_buffer, P2[0], mapq); + mapq = 1.0 - sqrt((double)delta_corrected / SIZE_READ); +#else + if (delta > DIST_SINGLE_THRESHOLD) { +#endif + update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P2[0], mapq); update = true; } } @@ -706,6 +744,7 @@ static bool stop_threads = false; void process_read(FILE *fpe1, FILE *fpe2, int round, unsigned int pass_id) { int8_t *reads_buffer = get_reads_buffer(pass_id); + float *reads_quality_buffer = get_reads_quality_buffer(pass_id); acc_results_t acc_res = accumulate_get_result(pass_id); curr_match = 0; @@ -714,6 +753,7 @@ void process_read(FILE *fpe1, FILE *fpe2, int round, 
unsigned int pass_id) args.result_tab = acc_res.results; args.round = round; args.reads_buffer = reads_buffer; + args.reads_quality_buffer = reads_quality_buffer; args.fpe1 = fpe1; args.fpe2 = fpe2; diff --git a/host/src/vartree.c b/host/src/vartree.c index 8a987c6..c87b06f 100644 --- a/host/src/vartree.c +++ b/host/src/vartree.c @@ -7,6 +7,7 @@ #include #include #include +#include #include "common.h" #include "genome.h" @@ -207,15 +208,64 @@ static bool print_variant_tree(variant_t *var, uint32_t seq_nr, uint64_t seq_pos return true; } -#define FREQUENCY_THRESHOLD 0.15 -#define DEPTH_THRESHOLD 2 +#define FREQUENCY_THRESHOLD 0.20 +#define DEPTH_THRESHOLD 3 +/* +great that you have implemented the quality score weights in the frequency table. I think using D=1 is not advisable since the number of false positives becomes extremely high, also with D=2 we need to be very careful. Thus, I would recommend to test the accuracy using Q-scores with our previously used parameters first (e.g. D>=3 and 20%, and the ones I have suggested in (1) (i, ii, and iii) and compare them to our old results. 
+ +(i) D=3: 25%, D=4: 20%, D=5: 15%; D>=6: 10% +(ii) D=2: 30%, D=3: 25%, D=4: 20%, D=5: 15%; D>=6: 10% +(iii) D=3: 20%, D=4: 15%, D>=5: 10%; + * + * */ + +__attribute__((unused)) int32_t depth_filter1(float freq) { + if(freq < 10.0f) + return UINT_MAX; + if(freq < 15.0f) + return 6; + if(freq < 20.0f) + return 5; + if(freq < 25.0f) + return 4; + return 5; +}; + +__attribute__((unused)) int32_t depth_filter2(float freq) { + if(freq < 10.0f) + return UINT_MAX; + if(freq < 15.0f) + return 6; + if(freq < 20.0f) + return 5; + if(freq < 25.0f) + return 4; + if(freq < 30.0f) + return 3; + return 2; +}; + +__attribute__((unused)) int32_t depth_filter3(float freq) { + if(freq < 10.0f) + return UINT_MAX; + if(freq < 15.0f) + return 5; + if(freq < 20.0f) + return 4; + return 3; +}; + +#define depth_filter depth_filter1 FILE * dbg_file = NULL; +FILE * sub_file = NULL; -static variant_t ** get_most_frequent_variant(genome_t * ref_genome, struct frequency_info ** frequency_table, uint64_t genome_pos) { +static variant_t ** get_most_frequent_variant(genome_t * ref_genome, struct frequency_info ** frequency_table, uint32_t seq_number, uint64_t seq_position) { static char nucleotide[4] = { 'A', 'C', 'T', 'G' }; + uint64_t genome_pos = ref_genome->pt_seq[seq_number] + seq_position; + variant_t** results = calloc(5, sizeof(variant_t*)); float total = 0; for(int i = 0; i < 5; ++i) { @@ -226,7 +276,7 @@ static variant_t ** get_most_frequent_variant(genome_t * ref_genome, struct freq float freq = frequency_table[i][genome_pos].freq; if(i == ref_genome->data[genome_pos]) continue; // not a variant if the same nucleotide as in reference genome if((freq / total > FREQUENCY_THRESHOLD) - && freq > DEPTH_THRESHOLD) { // if frequency > 20% and depth > 3, consider it a variant + && freq > depth_filter(freq)) { // if frequency and depth pass the threshold, consider it a variant // this is a substitution, create variant variant_t *var = (variant_t *)malloc(sizeof(variant_t)); @@ -239,9 +289,6 @@ 
static variant_t ** get_most_frequent_variant(genome_t * ref_genome, struct freq results[i] = var; } } - fprintf(dbg_file, "pos: %lu ref nucleotide %c frequencies: A:%f C:%f T:%f G:%f\n", genome_pos, nucleotide[ref_genome->data[genome_pos]], - frequency_table[0][genome_pos].freq, frequency_table[1][genome_pos].freq, - frequency_table[2][genome_pos].freq, frequency_table[3][genome_pos].freq); //printf("get_most_frequent_variant: genome_pos %lu, nucleotide max freq %d %f %c\n", genome_pos, nucId, max, nucId >= 0 ? nucleotide[nucId] : '-'); return results; @@ -288,14 +335,40 @@ void create_vcf() uint32_t nb_pos_multiple_var = 0; dbg_file = fopen("freq_debug.txt", "w"); + sub_file = fopen("subst.txt", "r"); + unsigned seq = 0; + uint64_t pos = 0; + static char nucleotide[4] = { 'A', 'C', 'T', 'G' }; + fprintf(dbg_file, "# seq pos ref-nucleotide frequencies:A C T G\n"); + while (EOF != fscanf(sub_file, "%u %lu\n", &seq, &pos)) + { + assert(seq > 0); + seq--; + assert(pos > 0); + pos--; + for (int inc = -2; inc <= 2; inc++) { + if(inc > 0 && pos + inc >= ref_genome->len_seq[seq]) continue; + if(inc < 0 && pos < abs(inc)) continue; + uint64_t genome_pos = ref_genome->pt_seq[seq] + pos + inc; + if(genome_pos > genome_get()->fasta_file_size) { + printf("WARNING: wrong genome position %lu. 
seq %u pos %lu inc %d\n", genome_pos, seq, pos, inc); + continue; + } + fprintf(dbg_file, "%u %lu %c %f %f %f %f\n", seq + 1, pos + inc + 1, + nucleotide[ref_genome->data[genome_pos]], + frequency_table[0][genome_pos].freq, frequency_table[1][genome_pos].freq, + frequency_table[2][genome_pos].freq, frequency_table[3][genome_pos].freq); + } + } + fclose(dbg_file); + fclose(sub_file); /* for each sequence in the genome */ for (uint32_t seq_number = 0; seq_number < ref_genome->nb_seq; seq_number++) { /* for each position in the sequence */ for (uint64_t seq_position = 0; seq_position < ref_genome->len_seq[seq_number]; seq_position++) { - uint64_t genome_pos = ref_genome->pt_seq[seq_number] + seq_position; - variant_t ** results = get_most_frequent_variant(ref_genome, frequency_table, genome_pos); + variant_t ** results = get_most_frequent_variant(ref_genome, frequency_table, seq_number, seq_position); int nb_var = 0; for(int i = 0; i < 5; ++i) { variant_t * var = results[i]; @@ -311,8 +384,6 @@ void create_vcf() } } - fclose(dbg_file); - free_frequency_table(); fclose(vcf_file); diff --git a/tests/compareVCF.py b/tests/compareVCF.py index 767b355..779526f 100755 --- a/tests/compareVCF.py +++ b/tests/compareVCF.py @@ -241,7 +241,7 @@ def print_stat(tp_stats, fp_stats, nb_tp, nb_fp, nb_cm): ############################################################################### -def compute_for_pos(v1_infos, v2_infos, tp, fp, tp_stat, fp_stat): +def compute_for_pos(v1_infos, v2_infos, tp, fp, tp_stat, fp_stat, chr, pos, dump): for (ref, alt), v1_info in v1_infos.items(): if (ref, alt) in v2_infos: tp += 1 @@ -249,22 +249,24 @@ def compute_for_pos(v1_infos, v2_infos, tp, fp, tp_stat, fp_stat): else: fp += 1 update_stat(fp_stat, v1_info) + if(dump): + print(chr, pos, ref, "->", alt) return tp, fp -def compute(V1, V2, tp_stat, fp_stat): +def compute(V1, V2, tp_stat, fp_stat, dump): tp = 0 fp = 0 for (chr, pos), v1_infos in V1.items(): - prev_fp = fp if (chr, pos) in V2: tp, fp = 
compute_for_pos( - v1_infos, V2[(chr, pos)], tp, fp, tp_stat, fp_stat) + v1_infos, V2[(chr, pos)], tp, fp, tp_stat, fp_stat, chr, pos, dump) else: fp += len(v1_infos) update_stat_for_pos(fp_stat, v1_infos) - #if(fp > prev_fp): - # print chr, pos + if(dump): + for (ref, alt), v1_info in v1_infos.items(): + print(chr, pos, ref, "->", alt) return tp, fp @@ -275,13 +277,13 @@ def print_VCF_quality(tp, fp, fn, cm, len_upvc, len_ref): print("cm:\t%.2f%%\t(%d/%d)" % (per(cm, len_ref), cm, len_ref)) -def compute_data(V_ref, V_upvc, len_ref, len_upvc): +def compute_data(V_ref, V_upvc, len_ref, len_upvc, dump=False): stats = {"tp": {}, "fp": {}} if args.enable_stat else None tp_stat = stats["tp"] if args.enable_stat else None fp_stat = stats["fp"] if args.enable_stat else None - tp, fp = compute(V_upvc, V_ref, tp_stat, fp_stat) - cm, fn = compute(V_ref, V_upvc, None, None) + tp, fp = compute(V_upvc, V_ref, tp_stat, fp_stat, False) + cm, fn = compute(V_ref, V_upvc, None, None, dump) # print_stat(tp_stat, fp_stat, tp, fp, cm) @@ -441,7 +443,7 @@ def get_data(filename, extract): args.upvc_file, True) print("\nsubstitution") -compute_data(SUB_ref, SUB_upvc, len_ref_sub, len_upvc_sub) +compute_data(SUB_ref, SUB_upvc, len_ref_sub, len_upvc_sub, True) print("\ninsertions") compute_data(INS_ref, INS_upvc, len_ref_ins, len_upvc_ins) From 0e33e26c12ca8f3c03743e752433e4c1d91f7779 Mon Sep 17 00:00:00 2001 From: Julien Legriel Date: Wed, 31 Mar 2021 17:25:47 +0200 Subject: [PATCH 10/48] Saving current state. Bug fixed for using indels for frequency table. Needs testing on real genome. Needs code refactoring and cleanup. 
--- common/inc/common.h | 15 +++-- dpu/inc/dout.h | 3 +- dpu/src/dout.c | 3 +- dpu/src/task.c | 5 +- host/src/getread.c | 2 +- host/src/processread.c | 145 ++++++++++++++++++++++++++++++++++++----- host/src/vartree.c | 19 +++--- 7 files changed, 159 insertions(+), 33 deletions(-) diff --git a/common/inc/common.h b/common/inc/common.h index 882efff..98e00b9 100644 --- a/common/inc/common.h +++ b/common/inc/common.h @@ -18,8 +18,8 @@ #define MAX_DPU_RESULTS (1 << 19) #define MAX_RESULTS_PER_READ (1 << 10) -#define SIZE_READ 148 -//#define SIZE_READ 120 +//#define SIZE_READ 148 +#define SIZE_READ 120 #define SIZE_SEED 14 #define SIZE_NEIGHBOUR_IN_BYTES ((SIZE_READ - SIZE_SEED) / 4) #define DELTA_NEIGHBOUR(round) ((SIZE_SEED * round) / 4) @@ -37,13 +37,14 @@ typedef uint32_t delta_info_t; * @brief Coordonates of the read that matched in the reference genome. */ typedef struct { - union { - uint64_t coord; + //union { + //uint64_t coord; struct { uint32_t seed_nr; - uint32_t seq_nr; + uint32_t seq_nr:31; + uint32_t nodp:1; }; - }; + //}; } dpu_result_coord_t; /** @@ -120,4 +121,6 @@ typedef struct { uint8_t nbr[ALIGN_DPU(SIZE_NEIGHBOUR_IN_BYTES)]; } coords_and_nbr_t; +#define USE_INDEL + #endif /* __COMMON_H__ */ diff --git a/dpu/inc/dout.h b/dpu/inc/dout.h index 490cd0d..0095d3a 100644 --- a/dpu/inc/dout.h +++ b/dpu/inc/dout.h @@ -68,8 +68,9 @@ void dout_init(unsigned int tid, dout_t *dout); * @param seed_nr Recorded seed number. * @param seq_nr Recorded sequence number. * @param stats To update statistical report. + * @param nodp True if the result was from nodp, false if from odpd. */ -void dout_add(dout_t *dout, uint32_t num, unsigned int score, uint32_t seed_nr, uint32_t seq_nr, dpu_tasklet_stats_t *stats); +void dout_add(dout_t *dout, uint32_t num, unsigned int score, uint32_t seed_nr, uint32_t seq_nr, dpu_tasklet_stats_t *stats, uint8_t nodp); /** * @brief locates a swap page for a given data out structure. 
diff --git a/dpu/src/dout.c b/dpu/src/dout.c index 97cdcc7..18cafaa 100644 --- a/dpu/src/dout.c +++ b/dpu/src/dout.c @@ -25,7 +25,7 @@ void dout_init(unsigned int tid, dout_t *dout) dout_clear(dout); } -void dout_add(dout_t *dout, uint32_t num, unsigned int score, uint32_t seed_nr, uint32_t seq_nr, dpu_tasklet_stats_t *stats) +void dout_add(dout_t *dout, uint32_t num, unsigned int score, uint32_t seed_nr, uint32_t seq_nr, dpu_tasklet_stats_t *stats, uint8_t nodp) { dpu_result_out_t *new_out; if (dout->nb_cached_out == MAX_LOCAL_RESULTS_PER_READ) { @@ -49,6 +49,7 @@ void dout_add(dout_t *dout, uint32_t num, unsigned int score, uint32_t seed_nr, new_out->score = score; new_out->coord.seed_nr = seed_nr; new_out->coord.seq_nr = seq_nr; + new_out->coord.nodp = nodp; dout->nb_cached_out++; dout->nb_results++; diff --git a/dpu/src/task.c b/dpu/src/task.c index 18c945a..ffd96a6 100644 --- a/dpu/src/task.c +++ b/dpu/src/task.c @@ -121,6 +121,8 @@ static void compare_neighbours(sysname_t tasklet_id, uint32_t *mini, coords_and_ STATS_STORE_NODP_TIME(tasklet_stats, (end + acc - start)); STATS_INCR_NB_NODP_CALLS(*tasklet_stats); + bool nodp = true; + //TODO uncomment for indel #ifdef USE_INDEL if (score_nodp == UINT_MAX) { @@ -131,6 +133,7 @@ static void compare_neighbours(sysname_t tasklet_id, uint32_t *mini, coords_and_ STATS_GET_END_TIME(end, acc); STATS_STORE_ODPD_TIME(tasklet_stats, (end + acc - start)); STATS_INCR_NB_ODPD_CALLS(*tasklet_stats); + nodp = false; } #endif @@ -151,7 +154,7 @@ static void compare_neighbours(sysname_t tasklet_id, uint32_t *mini, coords_and_ } dout_add(dout, request->num, (unsigned int)score, cached_coords_and_nbr->coord.seed_nr, cached_coords_and_nbr->coord.seq_nr, - tasklet_stats); + tasklet_stats, nodp); } static void compute_request(sysname_t tasklet_id, coords_and_nbr_t *cached_coords_and_nbr, uint8_t *current_read_nbr, diff --git a/host/src/getread.c b/host/src/getread.c index 943a320..7f5572a 100644 --- a/host/src/getread.c +++ 
b/host/src/getread.c @@ -98,7 +98,7 @@ void get_reads(FILE *fpe1, FILE *fpe2, unsigned int pass_id) for(uint8_t i = 33; i < 76; ++i) { quality_lookup_table[i - 33] = MAX(0.001f, 1.0f - pow(10.0f, -(i-33)/10.0f)); - printf("%d %f\n", i, quality_lookup_table[i - 33]); + /*printf("%d %f\n", i, quality_lookup_table[i - 33]);*/ } lookup = true; } diff --git a/host/src/processread.c b/host/src/processread.c index d8d7e49..4d96829 100644 --- a/host/src/processread.c +++ b/host/src/processread.c @@ -204,7 +204,7 @@ int DPD(int8_t *s1, int8_t *s2, backtrack_t *backtrack, int size_neighbour_in_sy * The code is return in "code" as a table of int8_t */ -#if 0 +#ifdef USE_INDEL static int code_alignment(uint8_t *code, int score, int8_t *gen, int8_t *read, unsigned size_neighbour_in_symbols) { int code_idx, computed_score, backtrack_idx; @@ -277,7 +277,9 @@ static int code_alignment(uint8_t *code, int score, int8_t *gen, int8_t *read, u code[code_idx++] = CODE_END; return code_idx; } +#endif +#if 0 static void set_variant( dpu_result_out_t result_match, genome_t *ref_genome, int8_t *reads_buffer, unsigned int size_neighbour_in_symbols) { @@ -410,14 +412,16 @@ static void add_to_non_mapped_read(int numread, int round, FILE *fpe1, FILE *fpe fprintf(fpe2, "\n"); pthread_mutex_unlock(&non_mapped_mutex); } +static uint64_t freq_update_count = 0; -static void update_frequency_table( +bool update_frequency_table( genome_t *ref_genome, dpu_result_out_t *result_tab, int8_t *reads_buffer, float *reads_quality_buffer, int pos, - float mapq) { + float mapq, + __attribute__((unused))int size_neighbour_in_symbols) { struct frequency_info **frequency_table = get_frequency_table(); uint64_t genome_pos = ref_genome->pt_seq[result_tab[pos].coord.seq_nr] + result_tab[pos].coord.seed_nr; @@ -427,14 +431,116 @@ static void update_frequency_table( //TODO: read the quality in the correct order (inverted or not) bool inv = num & 1; //TODO: assume no offset here + +#ifdef USE_INDEL + uint8_t 
code_result_tab[256]; + /*for(int i = 0; i < 256; ++i) code_result_tab[i] = 0;*/ + code_alignment(code_result_tab, result_tab[pos].score, &ref_genome->data[genome_pos], read, size_neighbour_in_symbols); + if (code_result_tab[0] != CODE_ERR) { + + uint64_t update_genome_position[SIZE_READ]; + int64_t curr_pos = genome_pos; + int i = 0; + int j = 0; + bool hasIndel = false; + while (code_result_tab[i] != CODE_END) { + int code_result = code_result_tab[i]; + int64_t pos_variant_read = code_result_tab[i + 1]; + /*int64_t pos_variant_genome = genome_pos + pos_variant_read;*/ + if (code_result == CODE_SUB) { + /* SNP = 0,1,2,3 (code A,C,T,G) */ + /*int snp = code_result_tab[i + 2];*/ + /*frequency_table[snp & 3][pos_variant_genome].freq += mapq * read_quality[inv ? SIZE_READ - pos_variant_read - 1 : pos_variant_read];*/ + /*frequency_table[snp & 3][pos_variant_genome].score += result_tab[pos].score;*/ + i += 3; + } + else if (code_result == CODE_INS) { + while(j <= pos_variant_read) { + update_genome_position[j++] = curr_pos++; + } + // insertion, skip these positions in the read + i += 2; + while (code_result_tab[i] < 4) { + i++; + update_genome_position[j++] = UINT64_MAX; + } + printf("Insertion pos %lu\n", pos_variant_read); + hasIndel = true; + } + else if (code_result == CODE_DEL) { + while(j < pos_variant_read) { + update_genome_position[j++] = curr_pos++; + } + // deletion, skip these positions in the reference genome + i += 2; + while (code_result_tab[i] < 4) { + i++; + curr_pos++; + } + printf("Deletion pos %lu\n", pos_variant_read); + hasIndel = true; + } + else + assert(0); + } + /*printf("j:%u\n", j);*/ + while(j < SIZE_READ) + update_genome_position[j++] = curr_pos++; + if(hasIndel) { + static char nucleotide[4] = { 'A', 'C', 'T', 'G' }; + printf("Read:\n"); + for(int k = 0; k < SIZE_READ; ++k) { + printf("%c", nucleotide[read[k]]); + } + printf("\ngenome:\n"); + for(uint64_t k = genome_pos; k < genome_pos + SIZE_READ; ++k) { + printf("%c", 
nucleotide[ref_genome->data[k]]); + } + printf("\nupdate pos:\n"); + uint64_t lastpos = 0; + for(uint64_t k = 0; k < SIZE_READ; ++k) { + if(k && update_genome_position[k] != lastpos+1) { + if(update_genome_position[k] == UINT64_MAX) + printf("No update at position %lu\n", k); + else if(lastpos == UINT64_MAX) + printf("New start at pos %lu = %lu / %c\n", k, update_genome_position[k], nucleotide[ref_genome->data[update_genome_position[k]]]); + else + printf("Change at pos %lu, diff %ld, %c\n", k, update_genome_position[k] - lastpos, nucleotide[ref_genome->data[update_genome_position[k]]]); + } + lastpos = update_genome_position[k]; + } + printf("\n\n"); + } + + for(uint64_t k = 0; k < SIZE_READ; ++k) { + uint64_t update_genome_pos = update_genome_position[k]; + if(update_genome_pos < genome_get()->fasta_file_size) { + frequency_table[read[k]][update_genome_pos].freq += mapq * read_quality[inv ? SIZE_READ - k - 1 : k]; + /*frequency_table[read[j]][genome_pos+j].score += result_tab[pos].score;*/ + frequency_table[read[k]][update_genome_pos].score++; + } + else if (update_genome_pos != UINT64_MAX) + printf("WARNING: genome update position computed is wrong %lu\n", update_genome_pos); + } + return hasIndel; + } + else + assert(0); + + return false; + +#else for(int j = 0; j < SIZE_READ; ++j) { if(genome_pos + j < genome_get()->fasta_file_size) { frequency_table[read[j]][genome_pos+j].freq += mapq * read_quality[inv ? 
SIZE_READ - j - 1 : j]; - frequency_table[read[j]][genome_pos+j].score += result_tab[pos].score; + /*frequency_table[read[j]][genome_pos+j].score += result_tab[pos].score;*/ + frequency_table[read[j]][genome_pos+j].score ++; } else printf("WARNING: reads matched at position that exceeds genome size\n"); } + return false; +#endif } static volatile unsigned int curr_match; @@ -506,7 +612,7 @@ static void do_process_read(process_read_arg_t *arg) genome_t *ref_genome = arg->ref_genome; FILE *fpe1 = arg->fpe1; FILE *fpe2 = arg->fpe2; - //unsigned int size_neighbour_in_symbols = (SIZE_NEIGHBOUR_IN_BYTES - DELTA_NEIGHBOUR(round)) * 4; + unsigned int size_neighbour_in_symbols = (SIZE_NEIGHBOUR_IN_BYTES - DELTA_NEIGHBOUR(round)) * 4; /* * The number of a pair is given by "num_read / 4 " (see dispatch_read function) @@ -522,6 +628,7 @@ static void do_process_read(process_read_arg_t *arg) * - when different position mapping are possible, choose the less covered zone */ + printf("freq_update_count: %lu\n", freq_update_count); while (true) { unsigned int i; if ((i = acquire_curr_match()) >= nb_match) { @@ -590,8 +697,16 @@ static void do_process_read(process_read_arg_t *arg) #else if(delta > DIST_PAIR_THRESHOLD) { #endif - update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P1[0], mapq); - update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P2[0], mapq); + // we have matched PE1 with indel (odpd) and PE2 with substition only + // Because of indel the delta threshold is passed + // PE1 does not contribute to frequency (has indel), but PE2 will => does not seem to be the reason + // + // Other possibility: DPU returns results from odpd, and we dont detect indel in those + // + // Other possibility: results from odpd implies that we select more reads due to distance threshold. 
+ // Normally not possible because it can only improve the second best score and hence reduce the delta + update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P1[0], mapq, size_neighbour_in_symbols); + update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P2[0], mapq, size_neighbour_in_symbols); update = true; } } @@ -608,12 +723,12 @@ static void do_process_read(process_read_arg_t *arg) #else if(delta > DIST_PAIR_THRESHOLD) { #endif - update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P1[0], mapq); - update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P2[0], mapq); + update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P1[0], mapq, size_neighbour_in_symbols); + update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P2[0], mapq, size_neighbour_in_symbols); update = true; } - //update_frequency_table(ref_genome, result_tab, reads_buffer, P1[0]); - //update_frequency_table(ref_genome, result_tab, reads_buffer, P2[0]); + //update_frequency_table(ref_genome, result_tab, reads_buffer, P1[0], size_neighbour_in_symbols); + //update_frequency_table(ref_genome, result_tab, reads_buffer, P2[0], size_neighbour_in_symbols); //update = true; } } @@ -657,7 +772,7 @@ static void do_process_read(process_read_arg_t *arg) #else if(delta > DIST_SINGLE_THRESHOLD) { #endif - update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P1[0], mapq); + update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P1[0], mapq, size_neighbour_in_symbols); update = true; } } @@ -676,7 +791,7 @@ static void do_process_read(process_read_arg_t *arg) #else if(delta > DIST_SINGLE_THRESHOLD) { #endif - update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P1[0], mapq); + update_frequency_table(ref_genome, result_tab, reads_buffer, 
reads_quality_buffer, P1[0], mapq, size_neighbour_in_symbols); update = true; } } @@ -699,7 +814,7 @@ static void do_process_read(process_read_arg_t *arg) if(delta > DIST_SINGLE_THRESHOLD) { #endif - update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P2[0], mapq); + update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P2[0], mapq, size_neighbour_in_symbols); update = true; } } @@ -718,7 +833,7 @@ static void do_process_read(process_read_arg_t *arg) #else if (delta > DIST_SINGLE_THRESHOLD) { #endif - update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P2[0], mapq); + update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P2[0], mapq, size_neighbour_in_symbols); update = true; } } diff --git a/host/src/vartree.c b/host/src/vartree.c index c87b06f..5448dde 100644 --- a/host/src/vartree.c +++ b/host/src/vartree.c @@ -219,7 +219,7 @@ great that you have implemented the quality score weights in the frequency table * * */ -__attribute__((unused)) int32_t depth_filter1(float freq) { +__attribute__((unused)) uint32_t depth_filter1(float freq) { if(freq < 10.0f) return UINT_MAX; if(freq < 15.0f) @@ -229,9 +229,9 @@ __attribute__((unused)) int32_t depth_filter1(float freq) { if(freq < 25.0f) return 4; return 5; -}; +} -__attribute__((unused)) int32_t depth_filter2(float freq) { +__attribute__((unused)) uint32_t depth_filter2(float freq) { if(freq < 10.0f) return UINT_MAX; if(freq < 15.0f) @@ -243,9 +243,9 @@ __attribute__((unused)) int32_t depth_filter2(float freq) { if(freq < 30.0f) return 3; return 2; -}; +} -__attribute__((unused)) int32_t depth_filter3(float freq) { +__attribute__((unused)) uint32_t depth_filter3(float freq) { if(freq < 10.0f) return UINT_MAX; if(freq < 15.0f) @@ -253,9 +253,9 @@ __attribute__((unused)) int32_t depth_filter3(float freq) { if(freq < 20.0f) return 4; return 3; -}; +} -#define depth_filter depth_filter1 +#define 
depth_filter depth_filter3 FILE * dbg_file = NULL; FILE * sub_file = NULL; @@ -271,12 +271,15 @@ static variant_t ** get_most_frequent_variant(genome_t * ref_genome, struct freq for(int i = 0; i < 5; ++i) { total += frequency_table[i][genome_pos].freq; } + if(total == 0) + return results; for(int i = 0; i < 5; ++i) { float freq = frequency_table[i][genome_pos].freq; + uint32_t score = frequency_table[i][genome_pos].score; if(i == ref_genome->data[genome_pos]) continue; // not a variant if the same nucleotide as in reference genome if((freq / total > FREQUENCY_THRESHOLD) - && freq > depth_filter(freq)) { // if frequency and depth pass the threshold, consider it a variant + && score >= depth_filter(freq * 100.0 / total)) { // if frequency and depth pass the threshold, consider it a variant // this is a substitution, create variant variant_t *var = (variant_t *)malloc(sizeof(variant_t)); From 0182090bef36b66f27506e10e860a198a574b97c Mon Sep 17 00:00:00 2001 From: Julien Legriel Date: Wed, 14 Apr 2021 08:31:41 +0200 Subject: [PATCH 11/48] Several changes/refactoring. 
Regression on chr3 need to debug --- common/inc/common.h | 18 ++- host/src/processread.c | 294 ++++++++++++++++++++++------------------- host/src/vartree.c | 49 +++++-- tests/compareVCF.py | 4 +- 4 files changed, 213 insertions(+), 152 deletions(-) diff --git a/common/inc/common.h b/common/inc/common.h index 98e00b9..8d5c1a0 100644 --- a/common/inc/common.h +++ b/common/inc/common.h @@ -18,7 +18,7 @@ #define MAX_DPU_RESULTS (1 << 19) #define MAX_RESULTS_PER_READ (1 << 10) -//#define SIZE_READ 148 +//#define SIZE_READ 149 #define SIZE_READ 120 #define SIZE_SEED 14 #define SIZE_NEIGHBOUR_IN_BYTES ((SIZE_READ - SIZE_SEED) / 4) @@ -121,6 +121,20 @@ typedef struct { uint8_t nbr[ALIGN_DPU(SIZE_NEIGHBOUR_IN_BYTES)]; } coords_and_nbr_t; -#define USE_INDEL + +// configuration for variant calling using frequency table + +// to activate use of reads with indels +//#define USE_INDEL +// to activate use of mapq score +//#define USE_MAPQ_SCORE + +// various parameters/thresholds +#define DIST_PAIR_THRESHOLD 1 +#define DIST_SINGLE_THRESHOLD 0 +#define MAPQ_SCALING_FACTOR 2 +#define READ_DIST_LOWER_BOUND 50 +#define READ_DIST_UPPER_BOUND 2000 +#define depth_filter depth_filter_fixed_3 #endif /* __COMMON_H__ */ diff --git a/host/src/processread.c b/host/src/processread.c index 4d96829..41c3153 100644 --- a/host/src/processread.c +++ b/host/src/processread.c @@ -412,102 +412,131 @@ static void add_to_non_mapped_read(int numread, int round, FILE *fpe1, FILE *fpe fprintf(fpe2, "\n"); pthread_mutex_unlock(&non_mapped_mutex); } -static uint64_t freq_update_count = 0; +#ifdef USE_INDEL +bool get_read_update_positions( + uint64_t * update_genome_position, + dpu_result_out_t *result_tab, + int pos, + genome_t *ref_genome, + uint64_t genome_pos, + int8_t *read, + __attribute__((unused))int size_neighbour_in_symbols) { + + // run smith and waterman algorithm to find indels + uint8_t code_result_tab[256]; + code_alignment(code_result_tab, result_tab[pos].score, &ref_genome->data[genome_pos], 
read, size_neighbour_in_symbols); + if (code_result_tab[0] != CODE_ERR) { + + // array that will contain for each read position, the genome position that it matches too + // This is the genome position that will be updated in the frequency table + // This genome position takes into account the shift due to possible indels found + // with smith-waterman algorithm + int64_t curr_pos = genome_pos; + int code_result_index = 0; + int read_pos = 0; + bool hasIndel = false; + while (code_result_tab[code_result_index] != CODE_END) { + int code_result = code_result_tab[code_result_index]; + int64_t pos_variant_read = code_result_tab[code_result_index + 1]; + if (code_result == CODE_SUB) { + // do nothing for substitution + code_result_index += 3; + } + else if (code_result == CODE_INS) { + while(read_pos <= pos_variant_read) { + update_genome_position[read_pos++] = curr_pos++; + } + // insertion, skip these positions in the read + code_result_index += 2; + while (code_result_tab[code_result_index] < 4) { + code_result_index++; + update_genome_position[read_pos++] = UINT64_MAX; + } + /*printf("Insertion pos %lu\n", pos_variant_read);*/ + hasIndel = true; + } + else if (code_result == CODE_DEL) { + while(read_pos < pos_variant_read) { + update_genome_position[read_pos++] = curr_pos++; + } + // deletion, skip these positions in the reference genome + code_result_index += 2; + while (code_result_tab[code_result_index] < 4) { + code_result_index++; + curr_pos++; + } + /*printf("Deletion pos %lu\n", pos_variant_read);*/ + hasIndel = true; + } + else + assert(0); + } + while(read_pos < SIZE_READ) + update_genome_position[read_pos++] = curr_pos++; + + return hasIndel; + } + else + assert(0); + + return false; +} +#endif + + +static pthread_mutex_t freq_table_mutex; + +/** + * function to update frequency table used for variant calling + **/ bool update_frequency_table( - genome_t *ref_genome, - dpu_result_out_t *result_tab, - int8_t *reads_buffer, - float *reads_quality_buffer, - 
int pos, - float mapq, - __attribute__((unused))int size_neighbour_in_symbols) { - - struct frequency_info **frequency_table = get_frequency_table(); - uint64_t genome_pos = ref_genome->pt_seq[result_tab[pos].coord.seq_nr] + result_tab[pos].coord.seed_nr; - int num = result_tab[pos].num; - int8_t *read = reads_buffer + (num * SIZE_READ); - float *read_quality = reads_quality_buffer + (num/2 * SIZE_READ); - //TODO: read the quality in the correct order (inverted or not) - bool inv = num & 1; - //TODO: assume no offset here + genome_t *ref_genome, + dpu_result_out_t *result_tab, + int8_t *reads_buffer, + float *reads_quality_buffer, + int pos, + float mapq, + __attribute__((unused))int size_neighbour_in_symbols) { + + struct frequency_info **frequency_table = get_frequency_table(); + uint64_t genome_pos = ref_genome->pt_seq[result_tab[pos].coord.seq_nr] + result_tab[pos].coord.seed_nr; + int num = result_tab[pos].num; + int8_t *read = reads_buffer + (num * SIZE_READ); + float *read_quality = reads_quality_buffer + (num/2 * SIZE_READ); + //TODO: read the quality in the correct order (inverted or not) + bool inv = num & 1; + //TODO: assume no offset here #ifdef USE_INDEL - uint8_t code_result_tab[256]; - /*for(int i = 0; i < 256; ++i) code_result_tab[i] = 0;*/ - code_alignment(code_result_tab, result_tab[pos].score, &ref_genome->data[genome_pos], read, size_neighbour_in_symbols); - if (code_result_tab[0] != CODE_ERR) { + /*pthread_mutex_lock(&freq_table_mutex);*/ uint64_t update_genome_position[SIZE_READ]; - int64_t curr_pos = genome_pos; - int i = 0; - int j = 0; - bool hasIndel = false; - while (code_result_tab[i] != CODE_END) { - int code_result = code_result_tab[i]; - int64_t pos_variant_read = code_result_tab[i + 1]; - /*int64_t pos_variant_genome = genome_pos + pos_variant_read;*/ - if (code_result == CODE_SUB) { - /* SNP = 0,1,2,3 (code A,C,T,G) */ - /*int snp = code_result_tab[i + 2];*/ - /*frequency_table[snp & 3][pos_variant_genome].freq += mapq * 
read_quality[inv ? SIZE_READ - pos_variant_read - 1 : pos_variant_read];*/ - /*frequency_table[snp & 3][pos_variant_genome].score += result_tab[pos].score;*/ - i += 3; - } - else if (code_result == CODE_INS) { - while(j <= pos_variant_read) { - update_genome_position[j++] = curr_pos++; - } - // insertion, skip these positions in the read - i += 2; - while (code_result_tab[i] < 4) { - i++; - update_genome_position[j++] = UINT64_MAX; - } - printf("Insertion pos %lu\n", pos_variant_read); - hasIndel = true; - } - else if (code_result == CODE_DEL) { - while(j < pos_variant_read) { - update_genome_position[j++] = curr_pos++; - } - // deletion, skip these positions in the reference genome - i += 2; - while (code_result_tab[i] < 4) { - i++; - curr_pos++; - } - printf("Deletion pos %lu\n", pos_variant_read); - hasIndel = true; + bool hasIndel = get_read_update_positions(update_genome_position, result_tab, pos, + ref_genome, genome_pos, read, size_neighbour_in_symbols); + + if(hasIndel && false) { + static char nucleotide[4] = { 'A', 'C', 'T', 'G' }; + printf("Read:\n"); + for(int k = 0; k < SIZE_READ; ++k) { + printf("%c", nucleotide[read[k]]); } - else - assert(0); - } - /*printf("j:%u\n", j);*/ - while(j < SIZE_READ) - update_genome_position[j++] = curr_pos++; - if(hasIndel) { - static char nucleotide[4] = { 'A', 'C', 'T', 'G' }; - printf("Read:\n"); - for(int k = 0; k < SIZE_READ; ++k) { - printf("%c", nucleotide[read[k]]); - } - printf("\ngenome:\n"); - for(uint64_t k = genome_pos; k < genome_pos + SIZE_READ; ++k) { - printf("%c", nucleotide[ref_genome->data[k]]); + printf("\ngenome:\n"); + for(uint64_t k = genome_pos; k < genome_pos + SIZE_READ; ++k) { + printf("%c", nucleotide[ref_genome->data[k]]); } printf("\nupdate pos:\n"); uint64_t lastpos = 0; for(uint64_t k = 0; k < SIZE_READ; ++k) { - if(k && update_genome_position[k] != lastpos+1) { - if(update_genome_position[k] == UINT64_MAX) - printf("No update at position %lu\n", k); - else if(lastpos == UINT64_MAX) - 
printf("New start at pos %lu = %lu / %c\n", k, update_genome_position[k], nucleotide[ref_genome->data[update_genome_position[k]]]); - else - printf("Change at pos %lu, diff %ld, %c\n", k, update_genome_position[k] - lastpos, nucleotide[ref_genome->data[update_genome_position[k]]]); - } - lastpos = update_genome_position[k]; + if(k && update_genome_position[k] != lastpos+1) { + if(update_genome_position[k] == UINT64_MAX) + printf("No update at position %lu\n", k); + else if(lastpos == UINT64_MAX) + printf("New start at pos %lu = %lu / %c\n", k, update_genome_position[k], nucleotide[ref_genome->data[update_genome_position[k]]]); + else + printf("Change at pos %lu, diff %ld, %c\n", k, update_genome_position[k] - lastpos, nucleotide[ref_genome->data[update_genome_position[k]]]); + } + lastpos = update_genome_position[k]; } printf("\n\n"); } @@ -522,24 +551,21 @@ bool update_frequency_table( else if (update_genome_pos != UINT64_MAX) printf("WARNING: genome update position computed is wrong %lu\n", update_genome_pos); } + /*pthread_mutex_unlock(&freq_table_mutex);*/ return hasIndel; - } - else - assert(0); - - return false; #else - for(int j = 0; j < SIZE_READ; ++j) { - if(genome_pos + j < genome_get()->fasta_file_size) { - frequency_table[read[j]][genome_pos+j].freq += mapq * read_quality[inv ? SIZE_READ - j - 1 : j]; - /*frequency_table[read[j]][genome_pos+j].score += result_tab[pos].score;*/ - frequency_table[read[j]][genome_pos+j].score ++; - } + pthread_mutex_lock(&freq_table_mutex); + for(int j = 0; j < SIZE_READ; ++j) { + if(genome_pos + j < genome_get()->fasta_file_size) { + frequency_table[read[j]][genome_pos+j].freq += mapq * read_quality[inv ? 
SIZE_READ - j - 1 : j]; + frequency_table[read[j]][genome_pos+j].score++; + } else printf("WARNING: reads matched at position that exceeds genome size\n"); } - return false; + pthread_mutex_unlock(&freq_table_mutex); + return false; #endif } @@ -572,12 +598,11 @@ typedef struct { static uint64_t nr_reads_total = 0ULL; static uint64_t nr_reads_non_mapped = 0ULL; +static uint64_t nr_reads_with_indels = 0ULL; static pthread_mutex_t nr_reads_mutex = PTHREAD_MUTEX_INITIALIZER; #define MISMATCH_COUNT(X) (X.score / 10) -#define DIST_PAIR_THRESHOLD 1 -#define DIST_SINGLE_THRESHOLD 0 -#define MAPQ_SCALING_FACTOR 2 +#define INVALID_SCORE 1000 static void keep_best_2_scores(unsigned score, unsigned* P1, unsigned *P2, unsigned x1, unsigned x2, unsigned* best_score) { @@ -601,6 +626,16 @@ static void keep_best_2_scores(unsigned score, unsigned* P1, unsigned *P2, unsig } } +static unsigned get_nb_scores(unsigned int * best_score) { + + unsigned np = 0; + if(best_score[0] < INVALID_SCORE) { + np++; + if(best_score[1] < INVALID_SCORE) np++; + } + return np; +} + /*#define USE_MAPQ_SCORE*/ static void do_process_read(process_read_arg_t *arg) { @@ -628,7 +663,6 @@ static void do_process_read(process_read_arg_t *arg) * - when different position mapping are possible, choose the less covered zone */ - printf("freq_update_count: %lu\n", freq_update_count); while (true) { unsigned int i; if ((i = acquire_curr_match()) >= nb_match) { @@ -647,9 +681,6 @@ static void do_process_read(process_read_arg_t *arg) // select best couples of paired reads unsigned int P1[2]; unsigned int P2[2]; - //unsigned int P1_all[1000]; - //unsigned int P2_all[1000]; - //unsigned int np_all = 0; unsigned int pos1, pos2, t1, t2; unsigned int best_score[2] = { 1000, 1000 }; /*unsigned int best_score_all = 1000;*/ @@ -662,7 +693,7 @@ static void do_process_read(process_read_arg_t *arg) t2 = result_tab[x2].num % 4; if (t1 + t2 == 3) // select significant pair { - if ((abs((int)pos2 - (int)pos1) > 50 && 
(abs((int)pos2 - (int)pos1) < 2000))) { + if ((abs((int)pos2 - (int)pos1) > READ_DIST_LOWER_BOUND && (abs((int)pos2 - (int)pos1) < READ_DIST_UPPER_BOUND))) { // update if this is one of the two best scores keep_best_2_scores(result_tab[x1].score + result_tab[x2].score, P1, P2, x1, x2, best_score); } @@ -671,11 +702,8 @@ static void do_process_read(process_read_arg_t *arg) } bool update = false; - unsigned np = 0; - if(best_score[0] < 1000) { - np++; - if(best_score[1] < 1000) np++; - } + bool hasIndel = false; + unsigned np = get_nb_scores(best_score); if (np > 0) { if(np == 2) { @@ -697,16 +725,8 @@ static void do_process_read(process_read_arg_t *arg) #else if(delta > DIST_PAIR_THRESHOLD) { #endif - // we have matched PE1 with indel (odpd) and PE2 with substition only - // Because of indel the delta threshold is passed - // PE1 does not contribute to frequency (has indel), but PE2 will => does not seem to be the reason - // - // Other possibility: DPU returns results from odpd, and we dont detect indel in those - // - // Other possibility: results from odpd implies that we select more reads due to distance threshold. 
- // Normally not possible because it can only improve the second best score and hence reduce the delta - update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P1[0], mapq, size_neighbour_in_symbols); - update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P2[0], mapq, size_neighbour_in_symbols); + hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P1[0], mapq, size_neighbour_in_symbols); + hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P2[0], mapq, size_neighbour_in_symbols); update = true; } } @@ -723,13 +743,10 @@ static void do_process_read(process_read_arg_t *arg) #else if(delta > DIST_PAIR_THRESHOLD) { #endif - update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P1[0], mapq, size_neighbour_in_symbols); - update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P2[0], mapq, size_neighbour_in_symbols); + hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P1[0], mapq, size_neighbour_in_symbols); + hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P2[0], mapq, size_neighbour_in_symbols); update = true; } - //update_frequency_table(ref_genome, result_tab, reads_buffer, P1[0], size_neighbour_in_symbols); - //update_frequency_table(ref_genome, result_tab, reads_buffer, P2[0], size_neighbour_in_symbols); - //update = true; } } if(true) { @@ -751,11 +768,7 @@ static void do_process_read(process_read_arg_t *arg) } } - unsigned np1 = 0, np2 = 0; - if(best_score_R1[0] < 1000) np1++; - if(best_score_R1[1] < 1000) np1++; - if(best_score_R2[0] < 1000) np2++; - if(best_score_R2[1] < 1000) np2++; + unsigned np1 = get_nb_scores(best_score_R1), np2 = get_nb_scores(best_score_R2); if(np1 == 2) { int delta = abs(MISMATCH_COUNT(result_tab[P1[0]]) - MISMATCH_COUNT(result_tab[P1[1]])); @@ 
-772,7 +785,7 @@ static void do_process_read(process_read_arg_t *arg) #else if(delta > DIST_SINGLE_THRESHOLD) { #endif - update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P1[0], mapq, size_neighbour_in_symbols); + hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P1[0], mapq, size_neighbour_in_symbols); update = true; } } @@ -791,7 +804,7 @@ static void do_process_read(process_read_arg_t *arg) #else if(delta > DIST_SINGLE_THRESHOLD) { #endif - update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P1[0], mapq, size_neighbour_in_symbols); + hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P1[0], mapq, size_neighbour_in_symbols); update = true; } } @@ -814,7 +827,7 @@ static void do_process_read(process_read_arg_t *arg) if(delta > DIST_SINGLE_THRESHOLD) { #endif - update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P2[0], mapq, size_neighbour_in_symbols); + hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P2[0], mapq, size_neighbour_in_symbols); update = true; } } @@ -833,7 +846,7 @@ static void do_process_read(process_read_arg_t *arg) #else if (delta > DIST_SINGLE_THRESHOLD) { #endif - update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P2[0], mapq, size_neighbour_in_symbols); + hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P2[0], mapq, size_neighbour_in_symbols); update = true; } } @@ -843,9 +856,11 @@ static void do_process_read(process_read_arg_t *arg) pthread_mutex_unlock(&nr_reads_mutex); add_to_non_mapped_read(numpair * 4, round, fpe1, fpe2, reads_buffer); } - pthread_mutex_lock(&nr_reads_mutex); - nr_reads_total++; - pthread_mutex_unlock(&nr_reads_mutex); + if(hasIndel) + nr_reads_with_indels++; + /*pthread_mutex_lock(&nr_reads_mutex);*/ + 
/*nr_reads_total++;*/ + /*pthread_mutex_unlock(&nr_reads_mutex);*/ } } } @@ -861,6 +876,7 @@ void process_read(FILE *fpe1, FILE *fpe2, int round, unsigned int pass_id) int8_t *reads_buffer = get_reads_buffer(pass_id); float *reads_quality_buffer = get_reads_quality_buffer(pass_id); acc_results_t acc_res = accumulate_get_result(pass_id); + nr_reads_total += get_reads_in_buffer(pass_id); curr_match = 0; @@ -897,6 +913,7 @@ void process_read_init() assert(pthread_mutex_init(&curr_match_mutex, NULL) == 0); assert(pthread_mutex_init(&non_mapped_mutex, NULL) == 0); + assert(pthread_mutex_init(&freq_table_mutex, NULL) == 0); assert(pthread_barrier_init(&barrier, NULL, PROCESS_READ_THREAD) == 0); for (unsigned int each_thread = 0; each_thread < PROCESS_READ_THREAD_SLAVE; each_thread++) { @@ -916,5 +933,8 @@ void process_read_free() assert(pthread_barrier_destroy(&barrier) == 0); assert(pthread_mutex_destroy(&curr_match_mutex) == 0); assert(pthread_mutex_destroy(&non_mapped_mutex) == 0); + assert(pthread_mutex_destroy(&freq_table_mutex) == 0); fprintf(stderr, "%% reads non mapped: %f%%\n", (float)nr_reads_non_mapped * 100.0 / (float)nr_reads_total); + fprintf(stderr, "%% reads with indels: %f%%\n", (float)nr_reads_with_indels * 100.0 / (float)(nr_reads_total - nr_reads_non_mapped)); + fprintf(stderr, "%% Total reads: %ld%%\n", nr_reads_total); } diff --git a/host/src/vartree.c b/host/src/vartree.c index 5448dde..6e7a3eb 100644 --- a/host/src/vartree.c +++ b/host/src/vartree.c @@ -208,8 +208,6 @@ static bool print_variant_tree(variant_t *var, uint32_t seq_nr, uint64_t seq_pos return true; } -#define FREQUENCY_THRESHOLD 0.20 -#define DEPTH_THRESHOLD 3 /* great that you have implemented the quality score weights in the frequency table. I think using D=1 is not advisable since the number of false positives becomes extremely high, also with D=2 we need to be very careful. Thus, I would recommend to test the accuracy using Q-scores with our previously used parameters first (e.g. 
D>=3 and 20%, and the ones I have suggested in (1) (i, ii, and iii) and compare them to our old results. @@ -228,7 +226,7 @@ __attribute__((unused)) uint32_t depth_filter1(float freq) { return 5; if(freq < 25.0f) return 4; - return 5; + return 3; } __attribute__((unused)) uint32_t depth_filter2(float freq) { @@ -255,7 +253,27 @@ __attribute__((unused)) uint32_t depth_filter3(float freq) { return 3; } -#define depth_filter depth_filter3 +__attribute__((unused)) uint32_t depth_filter_a(float freq) { + if(freq >= 20) + return 3; + if(freq >= 15) + return 5; + return UINT_MAX; +} + +__attribute__((unused)) uint32_t depth_filter_fixed_3(float freq) { + + if(freq < 20.0f) + return UINT_MAX; + return 3; +} + +__attribute__((unused)) uint32_t depth_filter_fixed_3_f15(float freq) { + + if(freq < 15.0f) + return UINT_MAX; + return 3; +} FILE * dbg_file = NULL; FILE * sub_file = NULL; @@ -278,13 +296,15 @@ static variant_t ** get_most_frequent_variant(genome_t * ref_genome, struct freq float freq = frequency_table[i][genome_pos].freq; uint32_t score = frequency_table[i][genome_pos].score; if(i == ref_genome->data[genome_pos]) continue; // not a variant if the same nucleotide as in reference genome - if((freq / total > FREQUENCY_THRESHOLD) - && score >= depth_filter(freq * 100.0 / total)) { // if frequency and depth pass the threshold, consider it a variant + /*if((freq / total > FREQUENCY_THRESHOLD) */ + if(score >= depth_filter(freq * 100.0 / total)) { // if frequency and depth pass the threshold, consider it a variant + /*printf("variant depth %u freq %f threshold %u\n", score, freq,*/ + /*depth_filter(freq * 100.0 / total));*/ // this is a substitution, create variant variant_t *var = (variant_t *)malloc(sizeof(variant_t)); var->score = frequency_table[i][genome_pos].score; - var->depth = frequency_table[i][genome_pos].freq; + var->depth = frequency_table[i][genome_pos].score; var->ref[0] = nucleotide[ref_genome->data[genome_pos]]; var->ref[1] = '\0'; var->alt[0] = 
nucleotide[i]; @@ -357,10 +377,17 @@ void create_vcf() printf("WARNING: wrong genome position %lu. seq %u pos %lu inc %d\n", genome_pos, seq, pos, inc); continue; } - fprintf(dbg_file, "%u %lu %c %f %f %f %f\n", seq + 1, pos + inc + 1, - nucleotide[ref_genome->data[genome_pos]], - frequency_table[0][genome_pos].freq, frequency_table[1][genome_pos].freq, - frequency_table[2][genome_pos].freq, frequency_table[3][genome_pos].freq); + fprintf(dbg_file, "%u %lu %c ", seq + 1, pos + inc + 1, nucleotide[ref_genome->data[genome_pos]]); + float total = 0.0f; + for(int m = 0; m < 5; ++m) { + total += frequency_table[m][genome_pos].freq; + } + for(int m = 0; m < 5; ++m) { + fprintf(dbg_file, "(%f %u %f %u) ", + frequency_table[m][genome_pos].freq, frequency_table[m][genome_pos].score, + frequency_table[m][genome_pos].freq * 100.0 / total, depth_filter(frequency_table[m][genome_pos].freq * 100.0 / total)); + } + fprintf(dbg_file, "\n"); } } fclose(dbg_file); diff --git a/tests/compareVCF.py b/tests/compareVCF.py index 779526f..cea6338 100755 --- a/tests/compareVCF.py +++ b/tests/compareVCF.py @@ -282,8 +282,8 @@ def compute_data(V_ref, V_upvc, len_ref, len_upvc, dump=False): tp_stat = stats["tp"] if args.enable_stat else None fp_stat = stats["fp"] if args.enable_stat else None - tp, fp = compute(V_upvc, V_ref, tp_stat, fp_stat, False) - cm, fn = compute(V_ref, V_upvc, None, None, dump) + tp, fp = compute(V_upvc, V_ref, tp_stat, fp_stat, dump) + cm, fn = compute(V_ref, V_upvc, None, None, False) # print_stat(tp_stat, fp_stat, tp, fp, cm) From fcade21d2f70568dc0bafaec3b5e236c4fa00ef2 Mon Sep 17 00:00:00 2001 From: Julien Legriel Date: Thu, 20 May 2021 11:26:08 +0200 Subject: [PATCH 12/48] Saving current dev status. Workaround for SW algo issue. 
Going to integrate Dominique's fix --- common/inc/common.h | 8 +-- host/src/processread.c | 137 +++++++++++++++++++++++++++++++---------- host/src/vartree.c | 3 +- tests/compareVCF.py | 4 +- 4 files changed, 111 insertions(+), 41 deletions(-) diff --git a/common/inc/common.h b/common/inc/common.h index 8d5c1a0..8b4f42a 100644 --- a/common/inc/common.h +++ b/common/inc/common.h @@ -18,8 +18,8 @@ #define MAX_DPU_RESULTS (1 << 19) #define MAX_RESULTS_PER_READ (1 << 10) -//#define SIZE_READ 149 -#define SIZE_READ 120 +#define SIZE_READ 148 +//#define SIZE_READ 120 #define SIZE_SEED 14 #define SIZE_NEIGHBOUR_IN_BYTES ((SIZE_READ - SIZE_SEED) / 4) #define DELTA_NEIGHBOUR(round) ((SIZE_SEED * round) / 4) @@ -125,7 +125,7 @@ typedef struct { // configuration for variant calling using frequency table // to activate use of reads with indels -//#define USE_INDEL +#define USE_INDEL // to activate use of mapq score //#define USE_MAPQ_SCORE @@ -135,6 +135,6 @@ typedef struct { #define MAPQ_SCALING_FACTOR 2 #define READ_DIST_LOWER_BOUND 50 #define READ_DIST_UPPER_BOUND 2000 -#define depth_filter depth_filter_fixed_3 +#define depth_filter depth_filter3 #endif /* __COMMON_H__ */ diff --git a/host/src/processread.c b/host/src/processread.c index 41c3153..78fcca0 100644 --- a/host/src/processread.c +++ b/host/src/processread.c @@ -37,6 +37,8 @@ #define PATH_DELETION (2) #define MAX_SUBSTITUTION (4) +static bool flag_dbg = false; + typedef struct { int type; int ix; @@ -169,12 +171,14 @@ int DPD(int8_t *s1, int8_t *s2, backtrack_t *backtrack, int size_neighbour_in_sy } else { if (path[i][j] == PATH_INSERTION) { j--; + flag_dbg = true; backtrack[align_distance].type = CODE_INS; backtrack[align_distance].ix = i; backtrack[align_distance].jx = j; align_distance++; } else if (path[i][j] == PATH_DELETION) { i--; + flag_dbg = true; backtrack[align_distance].type = CODE_DEL; backtrack[align_distance].ix = i; backtrack[align_distance].jx = j; @@ -205,13 +209,15 @@ int DPD(int8_t *s1, int8_t 
*s2, backtrack_t *backtrack, int size_neighbour_in_sy */ #ifdef USE_INDEL -static int code_alignment(uint8_t *code, int score, int8_t *gen, int8_t *read, unsigned size_neighbour_in_symbols) +static int code_alignment(uint8_t *code, int score, int8_t *gen, int8_t *read, unsigned size_neighbour_in_symbols, bool *flag) { int code_idx, computed_score, backtrack_idx; int size_read = SIZE_READ; int size_neighbour = size_neighbour_in_symbols; backtrack_t backtrak[size_read]; + *flag = false; + if (score == 0) { code[0] = CODE_END; return 1; @@ -421,11 +427,13 @@ bool get_read_update_positions( genome_t *ref_genome, uint64_t genome_pos, int8_t *read, - __attribute__((unused))int size_neighbour_in_symbols) { + __attribute__((unused))int size_neighbour_in_symbols, + bool * flag, + uint32_t * substCnt) { // run smith and waterman algorithm to find indels uint8_t code_result_tab[256]; - code_alignment(code_result_tab, result_tab[pos].score, &ref_genome->data[genome_pos], read, size_neighbour_in_symbols); + code_alignment(code_result_tab, result_tab[pos].score, &ref_genome->data[genome_pos], read, size_neighbour_in_symbols, flag); if (code_result_tab[0] != CODE_ERR) { // array that will contain for each read position, the genome position that it matches too @@ -442,6 +450,7 @@ bool get_read_update_positions( if (code_result == CODE_SUB) { // do nothing for substitution code_result_index += 3; + (*substCnt)++; } else if (code_result == CODE_INS) { while(read_pos <= pos_variant_read) { @@ -510,48 +519,96 @@ bool update_frequency_table( #ifdef USE_INDEL - /*pthread_mutex_lock(&freq_table_mutex);*/ + static bool debug = false; + static char nucleotide[4] = { 'A', 'C', 'T', 'G' }; uint64_t update_genome_position[SIZE_READ]; + uint32_t substCnt = 0; + flag_dbg = false; bool hasIndel = get_read_update_positions(update_genome_position, result_tab, pos, - ref_genome, genome_pos, read, size_neighbour_in_symbols); + ref_genome, genome_pos, read, size_neighbour_in_symbols, &flag_dbg, 
&substCnt); - if(hasIndel && false) { - static char nucleotide[4] = { 'A', 'C', 'T', 'G' }; + if(hasIndel && debug) { + + assert(!result_tab[pos].coord.nodp); printf("Read:\n"); for(int k = 0; k < SIZE_READ; ++k) { printf("%c", nucleotide[read[k]]); } - printf("\ngenome:\n"); + printf("\ngenome (pos %u:%u):\n", result_tab[pos].coord.seed_nr, result_tab[pos].coord.seed_nr+SIZE_READ); for(uint64_t k = genome_pos; k < genome_pos + SIZE_READ; ++k) { printf("%c", nucleotide[ref_genome->data[k]]); } +/*#define RESET "\033[0m"*/ +/*#define RED "\033[31m" [> Red <]*/ printf("\nupdate pos:\n"); uint64_t lastpos = 0; for(uint64_t k = 0; k < SIZE_READ; ++k) { if(k && update_genome_position[k] != lastpos+1) { if(update_genome_position[k] == UINT64_MAX) - printf("No update at position %lu\n", k); + printf("X"); + /*printf("No update at position %lu\n", k);*/ else if(lastpos == UINT64_MAX) - printf("New start at pos %lu = %lu / %c\n", k, update_genome_position[k], nucleotide[ref_genome->data[update_genome_position[k]]]); + /*printf("New start at pos %lu = %lu / %c\n", k, update_genome_position[k], nucleotide[ref_genome->data[update_genome_position[k]]]);*/ + printf("%c", nucleotide[ref_genome->data[update_genome_position[k]]]); else - printf("Change at pos %lu, diff %ld, %c\n", k, update_genome_position[k] - lastpos, nucleotide[ref_genome->data[update_genome_position[k]]]); + /*printf("Change at pos %lu, diff %ld, %c\n", k, update_genome_position[k] - lastpos, nucleotide[ref_genome->data[update_genome_position[k]]]);*/ + printf("%c", nucleotide[ref_genome->data[update_genome_position[k]]]); } + /*else if(nucleotide[ref_genome->data[update_genome_position[k]]] != nucleotide[read[k]]) {*/ + /*printf(RED "%c" RESET, nucleotide[ref_genome->data[update_genome_position[k]]]);*/ + /*}*/ + else + printf("%c", nucleotide[ref_genome->data[update_genome_position[k]]]); lastpos = update_genome_position[k]; } + + printf("\nsubst:\n"); + for(uint64_t k = 0; k < SIZE_READ; ++k) { + 
if(update_genome_position[k] == UINT64_MAX) { + printf(" "); + continue; + } + else if(nucleotide[ref_genome->data[update_genome_position[k]]] != nucleotide[read[k]]) { + printf("U"); + substCnt++; + } + else + printf(" "); + } printf("\n\n"); + fflush(stdout); + } + else if(debug) { + if(!result_tab[pos].coord.nodp) { + printf("\nWarning: odpd result with no indels detected (flag = %d, subst cnt %u)):\n", flag_dbg, substCnt); + printf("Read:\n"); + for(int k = 0; k < SIZE_READ; ++k) { + printf("%c", nucleotide[read[k]]); + } + printf("\ngenome (pos %u:%u):\n", result_tab[pos].coord.seed_nr, result_tab[pos].coord.seed_nr+SIZE_READ); + for(uint64_t k = genome_pos; k < genome_pos + SIZE_READ; ++k) { + printf("%c", nucleotide[ref_genome->data[k]]); + } + printf("\n\n"); + fflush(stdout); + } } - for(uint64_t k = 0; k < SIZE_READ; ++k) { - uint64_t update_genome_pos = update_genome_position[k]; - if(update_genome_pos < genome_get()->fasta_file_size) { - frequency_table[read[k]][update_genome_pos].freq += mapq * read_quality[inv ? SIZE_READ - k - 1 : k]; - /*frequency_table[read[j]][genome_pos+j].score += result_tab[pos].score;*/ - frequency_table[read[k]][update_genome_pos].score++; - } - else if (update_genome_pos != UINT64_MAX) - printf("WARNING: genome update position computed is wrong %lu\n", update_genome_pos); + pthread_mutex_lock(&freq_table_mutex); + if(substCnt <= MAX_SUBSTITUTION && (hasIndel || result_tab[pos].coord.nodp)) { + for(uint64_t k = 0; k < SIZE_READ; ++k) { + uint64_t update_genome_pos = update_genome_position[k]; + if(update_genome_pos < genome_get()->fasta_file_size) { + frequency_table[read[k]][update_genome_pos].freq += mapq * read_quality[inv ? 
SIZE_READ - k - 1 : k]; + /*frequency_table[read[j]][genome_pos+j].score += result_tab[pos].score;*/ + frequency_table[read[k]][update_genome_pos].score++; + } + else if (update_genome_pos != UINT64_MAX) + printf("WARNING: genome update position computed is wrong %lu\n", update_genome_pos); + } } - /*pthread_mutex_unlock(&freq_table_mutex);*/ + /*fflush(stdout);*/ + pthread_mutex_unlock(&freq_table_mutex); return hasIndel; #else @@ -597,6 +654,7 @@ typedef struct { } process_read_arg_t; static uint64_t nr_reads_total = 0ULL; +static uint64_t nr_reads_total_from_dpus = 0ULL; static uint64_t nr_reads_non_mapped = 0ULL; static uint64_t nr_reads_with_indels = 0ULL; static pthread_mutex_t nr_reads_mutex = PTHREAD_MUTEX_INITIALIZER; @@ -703,6 +761,15 @@ static void do_process_read(process_read_arg_t *arg) bool update = false; bool hasIndel = false; + +#if 0 + for (unsigned int read = i; read < j; read++) { + + hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, read, 1.0f, size_neighbour_in_symbols); + } + if(true) { +#endif + unsigned np = get_nb_scores(best_score); if (np > 0) { @@ -710,8 +777,8 @@ static void do_process_read(process_read_arg_t *arg) // found at least 2 matching pairs of positions. 
Check the delta between the two pairs to // decide whether we should keep the best pair - int delta = abs((MISMATCH_COUNT(result_tab[P1[0]]) + MISMATCH_COUNT(result_tab[P2[0]])) - - (MISMATCH_COUNT(result_tab[P1[1]]) + MISMATCH_COUNT(result_tab[P2[1]]))); + int delta = abs((int)(MISMATCH_COUNT(result_tab[P1[0]]) + MISMATCH_COUNT(result_tab[P2[0]])) + - (int)(MISMATCH_COUNT(result_tab[P1[1]]) + MISMATCH_COUNT(result_tab[P2[1]]))); float mapq = 1.0f; #ifdef USE_MAPQ_SCORE @@ -731,7 +798,7 @@ static void do_process_read(process_read_arg_t *arg) } } else if(np) { // only one result, take it - int delta = abs((MISMATCH_COUNT(result_tab[P1[0]]) + MISMATCH_COUNT(result_tab[P2[0]])) - (2 * (MAX_SUBSTITUTION + 1))); + int delta = abs((int)(MISMATCH_COUNT(result_tab[P1[0]]) + MISMATCH_COUNT(result_tab[P2[0]])) - (2 * (MAX_SUBSTITUTION + 1))); float mapq = 1.0f; #ifdef USE_MAPQ_SCORE int delta_corrected = MISMATCH_COUNT(result_tab[P1[0]]) + MISMATCH_COUNT(result_tab[P2[0]]) + MAPQ_SCALING_FACTOR * ((2 * (MAX_SUBSTITUTION + 1)) - delta); @@ -771,7 +838,7 @@ static void do_process_read(process_read_arg_t *arg) unsigned np1 = get_nb_scores(best_score_R1), np2 = get_nb_scores(best_score_R2); if(np1 == 2) { - int delta = abs(MISMATCH_COUNT(result_tab[P1[0]]) - MISMATCH_COUNT(result_tab[P1[1]])); + int delta = abs((int)MISMATCH_COUNT(result_tab[P1[0]]) - (int)MISMATCH_COUNT(result_tab[P1[1]])); float mapq = 1.0f; #ifdef USE_MAPQ_SCORE @@ -790,7 +857,7 @@ static void do_process_read(process_read_arg_t *arg) } } else if(np1) { - int delta = abs(MISMATCH_COUNT(result_tab[P1[0]]) - (MAX_SUBSTITUTION + 1)); + int delta = abs((int)MISMATCH_COUNT(result_tab[P1[0]]) - (MAX_SUBSTITUTION + 1)); float mapq = 1.0f; #ifdef USE_MAPQ_SCORE @@ -811,7 +878,7 @@ static void do_process_read(process_read_arg_t *arg) if(np2 == 2) { - int delta = abs(MISMATCH_COUNT(result_tab[P2[0]]) - MISMATCH_COUNT(result_tab[P2[1]])); + int delta = abs((int)MISMATCH_COUNT(result_tab[P2[0]]) - 
(int)MISMATCH_COUNT(result_tab[P2[1]])); float mapq = 1.0f; #ifdef USE_MAPQ_SCORE @@ -832,7 +899,7 @@ static void do_process_read(process_read_arg_t *arg) } } else if(np2) { - int delta = abs(MISMATCH_COUNT(result_tab[P2[0]]) - (MAX_SUBSTITUTION + 1)); + int delta = abs((int)MISMATCH_COUNT(result_tab[P2[0]]) - (MAX_SUBSTITUTION + 1)); float mapq = 1.0f; #ifdef USE_MAPQ_SCORE @@ -858,9 +925,9 @@ static void do_process_read(process_read_arg_t *arg) } if(hasIndel) nr_reads_with_indels++; - /*pthread_mutex_lock(&nr_reads_mutex);*/ - /*nr_reads_total++;*/ - /*pthread_mutex_unlock(&nr_reads_mutex);*/ + pthread_mutex_lock(&nr_reads_mutex); + nr_reads_total_from_dpus++; + pthread_mutex_unlock(&nr_reads_mutex); } } } @@ -876,7 +943,7 @@ void process_read(FILE *fpe1, FILE *fpe2, int round, unsigned int pass_id) int8_t *reads_buffer = get_reads_buffer(pass_id); float *reads_quality_buffer = get_reads_quality_buffer(pass_id); acc_results_t acc_res = accumulate_get_result(pass_id); - nr_reads_total += get_reads_in_buffer(pass_id); + nr_reads_total += get_reads_in_buffer(pass_id) / 4; curr_match = 0; @@ -934,7 +1001,9 @@ void process_read_free() assert(pthread_mutex_destroy(&curr_match_mutex) == 0); assert(pthread_mutex_destroy(&non_mapped_mutex) == 0); assert(pthread_mutex_destroy(&freq_table_mutex) == 0); - fprintf(stderr, "%% reads non mapped: %f%%\n", (float)nr_reads_non_mapped * 100.0 / (float)nr_reads_total); - fprintf(stderr, "%% reads with indels: %f%%\n", (float)nr_reads_with_indels * 100.0 / (float)(nr_reads_total - nr_reads_non_mapped)); + fflush(stdout); + fprintf(stderr, "%% reads non mapped: %f%%\n", (float)nr_reads_non_mapped * 100.0 / (float)nr_reads_total_from_dpus); + fprintf(stderr, "%% reads with indels: %f%%\n", (float)nr_reads_with_indels * 100.0 / (float)(nr_reads_total_from_dpus - nr_reads_non_mapped)); + fprintf(stderr, "%% Total reads from dpus: %ld%%\n", nr_reads_total_from_dpus); fprintf(stderr, "%% Total reads: %ld%%\n", nr_reads_total); } diff --git 
a/host/src/vartree.c b/host/src/vartree.c index 6e7a3eb..21ba0e0 100644 --- a/host/src/vartree.c +++ b/host/src/vartree.c @@ -359,6 +359,7 @@ void create_vcf() dbg_file = fopen("freq_debug.txt", "w"); sub_file = fopen("subst.txt", "r"); + assert(sub_file); unsigned seq = 0; uint64_t pos = 0; static char nucleotide[4] = { 'A', 'C', 'T', 'G' }; @@ -382,7 +383,7 @@ void create_vcf() for(int m = 0; m < 5; ++m) { total += frequency_table[m][genome_pos].freq; } - for(int m = 0; m < 5; ++m) { + for(int m = 0; m < 4; ++m) { fprintf(dbg_file, "(%f %u %f %u) ", frequency_table[m][genome_pos].freq, frequency_table[m][genome_pos].score, frequency_table[m][genome_pos].freq * 100.0 / total, depth_filter(frequency_table[m][genome_pos].freq * 100.0 / total)); diff --git a/tests/compareVCF.py b/tests/compareVCF.py index cea6338..779526f 100755 --- a/tests/compareVCF.py +++ b/tests/compareVCF.py @@ -282,8 +282,8 @@ def compute_data(V_ref, V_upvc, len_ref, len_upvc, dump=False): tp_stat = stats["tp"] if args.enable_stat else None fp_stat = stats["fp"] if args.enable_stat else None - tp, fp = compute(V_upvc, V_ref, tp_stat, fp_stat, dump) - cm, fn = compute(V_ref, V_upvc, None, None, False) + tp, fp = compute(V_upvc, V_ref, tp_stat, fp_stat, False) + cm, fn = compute(V_ref, V_upvc, None, None, dump) # print_stat(tp_stat, fp_stat, tp, fp, cm) From 7a5391b177b94f1ba91a2d6d8fed8605b246f99f Mon Sep 17 00:00:00 2001 From: Julien Legriel Date: Thu, 20 May 2021 16:10:04 +0200 Subject: [PATCH 13/48] Integrate Dominique's fix. Seems to remove all missed indels on chr3. Need to check qor though. 
--- common/inc/common.h | 4 +-- host/src/processread.c | 59 ++++++++++++++---------------------------- 2 files changed, 21 insertions(+), 42 deletions(-) diff --git a/common/inc/common.h b/common/inc/common.h index 8b4f42a..8ba69d8 100644 --- a/common/inc/common.h +++ b/common/inc/common.h @@ -18,8 +18,8 @@ #define MAX_DPU_RESULTS (1 << 19) #define MAX_RESULTS_PER_READ (1 << 10) -#define SIZE_READ 148 -//#define SIZE_READ 120 +//#define SIZE_READ 148 +#define SIZE_READ 120 #define SIZE_SEED 14 #define SIZE_NEIGHBOUR_IN_BYTES ((SIZE_READ - SIZE_SEED) / 4) #define DELTA_NEIGHBOUR(round) ((SIZE_SEED * round) / 4) diff --git a/host/src/processread.c b/host/src/processread.c index 78fcca0..c72b3dd 100644 --- a/host/src/processread.c +++ b/host/src/processread.c @@ -32,9 +32,6 @@ #define PQD_INIT_VAL (99) -#define PATH_SUBSTITUTION (0) -#define PATH_INSERTION (1) -#define PATH_DELETION (2) #define MAX_SUBSTITUTION (4) static bool flag_dbg = false; @@ -48,7 +45,7 @@ typedef struct { static int min(int a, int b) { return a < b ? 
a : b; } static void DPD_compute( - int s1, int s2, int *Dij, int Dijm, int Dimj, int Dimjm, int *Pij, int Pijm, int *Qij, int Qimj, int *path) + int s1, int s2, int *Dij, int Dijm, int Dimj, int Dimjm, int *Pij, int Pijm, int *Qij, int Qimj) { int min_QP, d; @@ -57,10 +54,8 @@ static void DPD_compute( if (*Pij < *Qij) { min_QP = *Pij; - *path = PATH_INSERTION; } else { min_QP = *Qij; - *path = PATH_DELETION; } d = Dimjm; if ((s1 & 3) != (s2 & 3)) { @@ -68,7 +63,6 @@ static void DPD_compute( } if (d < min_QP) { *Dij = d; - *path = PATH_SUBSTITUTION; } else { *Dij = min_QP; } @@ -81,7 +75,6 @@ int DPD(int8_t *s1, int8_t *s2, backtrack_t *backtrack, int size_neighbour_in_sy int D[matrix_size][matrix_size]; int P[matrix_size][matrix_size]; int Q[matrix_size][matrix_size]; - int path[matrix_size][matrix_size]; int min_score = PQD_INIT_VAL; int min_score_i_idx = 0; int min_score_j_idx = 0; @@ -104,7 +97,7 @@ int DPD(int8_t *s1, int8_t *s2, backtrack_t *backtrack, int size_neighbour_in_sy for (int i = 1; i < diagonal; i++) { for (int j = 1; j < i + diagonal; j++) { DPD_compute(s1[i - 1], s2[j - 1], &D[i][j], D[i][j - 1], D[i - 1][j], D[i - 1][j - 1], &P[i][j], P[i][j - 1], - &Q[i][j], Q[i - 1][j], &path[i][j]); + &Q[i][j], Q[i - 1][j]); } Q[i][i + diagonal] = PQD_INIT_VAL; D[i][i + diagonal] = PQD_INIT_VAL; @@ -114,7 +107,7 @@ int DPD(int8_t *s1, int8_t *s2, backtrack_t *backtrack, int size_neighbour_in_sy D[i][i - diagonal] = PQD_INIT_VAL; for (int j = i - diagonal + 1; j < i + diagonal; j++) { DPD_compute(s1[i - 1], s2[j - 1], &D[i][j], D[i][j - 1], D[i - 1][j], D[i - 1][j - 1], &P[i][j], P[i][j - 1], - &Q[i][j], Q[i - 1][j], &path[i][j]); + &Q[i][j], Q[i - 1][j]); } Q[i][i + diagonal] = PQD_INIT_VAL; D[i][i + diagonal] = PQD_INIT_VAL; @@ -125,7 +118,7 @@ int DPD(int8_t *s1, int8_t *s2, backtrack_t *backtrack, int size_neighbour_in_sy D[i][i - diagonal] = PQD_INIT_VAL; for (int j = i - diagonal + 1; j < matrix_size; j++) { DPD_compute(s1[i - 1], s2[j - 1], &D[i][j], 
D[i][j - 1], D[i - 1][j], D[i - 1][j - 1], &P[i][j], P[i][j - 1], - &Q[i][j], Q[i - 1][j], &path[i][j]); + &Q[i][j], Q[i - 1][j]); } if (D[i][matrix_size - 1] < min_score) { min_score = D[i][matrix_size - 1]; @@ -144,22 +137,10 @@ int DPD(int8_t *s1, int8_t *s2, backtrack_t *backtrack, int size_neighbour_in_sy { int i = min_score_i_idx; int j = min_score_j_idx; - - /* Delete the INDELS at the ends */ - while (path[i][j] != PATH_SUBSTITUTION) { - if (path[i][j] == PATH_INSERTION) { - j--; - } else if (path[i][j] == PATH_DELETION) { - i--; - } else { - return -1; - } - } - - /* i>1 && j>1 conditions erased the INDELS at the beginning */ backtrack[0].type = CODE_END; - while ((i > 1) && (j > 1)) { - if (path[i][j] == PATH_SUBSTITUTION) { + while ((i > 0) && (j > 0)) { + int hv = (D[i-1][j] < D[i][j-1]) ? D[i-1][j] : D[i][j-1]; + if (D[i-1][j-1] <= hv) { i--; j--; if (D[i][j] != D[i - 1][j - 1]) { @@ -169,22 +150,20 @@ int DPD(int8_t *s1, int8_t *s2, backtrack_t *backtrack, int size_neighbour_in_sy align_distance++; } } else { - if (path[i][j] == PATH_INSERTION) { + if (D[i-1][j] > D[i][j-1]) { j--; flag_dbg = true; backtrack[align_distance].type = CODE_INS; backtrack[align_distance].ix = i; backtrack[align_distance].jx = j; align_distance++; - } else if (path[i][j] == PATH_DELETION) { + } else { i--; flag_dbg = true; backtrack[align_distance].type = CODE_DEL; backtrack[align_distance].ix = i; backtrack[align_distance].jx = j; align_distance++; - } else { - ERROR_EXIT(ERR_PROCESSREAD_DPD_FAILED, "Error during DPD compute"); } } } @@ -242,7 +221,7 @@ static int code_alignment(uint8_t *code, int score, int8_t *gen, int8_t *read, u return code_idx; /* Otherwise, re-compute the matrix (only some diagonals) and put in backtrack the path */ - backtrack_idx = DPD(&gen[SIZE_SEED], &read[SIZE_SEED], backtrak, size_neighbour_in_symbols); + backtrack_idx = DPD(gen, read, backtrak, size_neighbour_in_symbols); if (backtrack_idx == -1) { code[0] = CODE_ERR; return 1; @@ -253,28 
+232,28 @@ static int code_alignment(uint8_t *code, int score, int8_t *gen, int8_t *read, u while (backtrack_idx > 0) { if (backtrak[backtrack_idx].type == CODE_SUB) { code[code_idx++] = CODE_SUB; - code[code_idx++] = backtrak[backtrack_idx].jx + SIZE_SEED - 1; - code[code_idx++] = read[backtrak[backtrack_idx].jx + SIZE_SEED - 1]; + code[code_idx++] = backtrak[backtrack_idx].jx - 1; + code[code_idx++] = read[backtrak[backtrack_idx].jx - 1]; backtrack_idx--; } else { if (backtrak[backtrack_idx].type == CODE_DEL) { int backtrack_jx = backtrak[backtrack_idx].jx; code[code_idx++] = CODE_DEL; - code[code_idx++] = backtrak[backtrack_idx].ix + SIZE_SEED; - code[code_idx++] = gen[backtrak[backtrack_idx].ix + SIZE_SEED] & 3; + code[code_idx++] = backtrak[backtrack_idx].ix; + code[code_idx++] = gen[backtrak[backtrack_idx].ix] & 3; backtrack_idx--; while ((backtrak[backtrack_idx].type == CODE_DEL) && (backtrack_jx == backtrak[backtrack_idx].jx)) { - code[code_idx++] = gen[backtrak[backtrack_idx].ix + SIZE_SEED] & 3; + code[code_idx++] = gen[backtrak[backtrack_idx].ix] & 3; backtrack_idx--; } } else { int backtrack_ix = backtrak[backtrack_idx].ix; code[code_idx++] = CODE_INS; - code[code_idx++] = backtrak[backtrack_idx].jx + SIZE_SEED - 1; - code[code_idx++] = read[backtrak[backtrack_idx].jx + SIZE_SEED]; + code[code_idx++] = backtrak[backtrack_idx].jx - 1; + code[code_idx++] = read[backtrak[backtrack_idx].jx]; backtrack_idx--; while ((backtrak[backtrack_idx].type == CODE_INS) && (backtrack_ix == backtrak[backtrack_idx].ix)) { - code[code_idx++] = read[backtrak[backtrack_idx].jx + SIZE_SEED]; + code[code_idx++] = read[backtrak[backtrack_idx].jx]; backtrack_idx--; } } @@ -519,7 +498,7 @@ bool update_frequency_table( #ifdef USE_INDEL - static bool debug = false; + static bool debug = true; static char nucleotide[4] = { 'A', 'C', 'T', 'G' }; uint64_t update_genome_position[SIZE_READ]; uint32_t substCnt = 0; From 7affce1762cbda06970e018a9f07e995059db5bd Mon Sep 17 00:00:00 2001 
From: Julien Legriel Date: Mon, 24 May 2021 16:18:11 +0200 Subject: [PATCH 14/48] Fix the issue by retrieving previous code to read SW result. And do not take into consideration the cases with multiple indels (pb with non increasing positions of indels). Also see some issue that some indels are still not detected, need to check. --- host/src/processread.c | 120 +++++++++++++++++++++++++++++++++-------- 1 file changed, 99 insertions(+), 21 deletions(-) diff --git a/host/src/processread.c b/host/src/processread.c index c72b3dd..8c3b670 100644 --- a/host/src/processread.c +++ b/host/src/processread.c @@ -10,6 +10,7 @@ #include #include "accumulateread.h" +#include "common.h" #include "genome.h" #include "getread.h" #include "processread.h" @@ -399,7 +400,7 @@ static void add_to_non_mapped_read(int numread, int round, FILE *fpe1, FILE *fpe } #ifdef USE_INDEL -bool get_read_update_positions( +int get_read_update_positions( uint64_t * update_genome_position, dpu_result_out_t *result_tab, int pos, @@ -421,41 +422,111 @@ bool get_read_update_positions( // with smith-waterman algorithm int64_t curr_pos = genome_pos; int code_result_index = 0; - int read_pos = 0; - bool hasIndel = false; + int64_t read_pos = 0; + int ref_pos = 0; + int nbIndels = 0; while (code_result_tab[code_result_index] != CODE_END) { int code_result = code_result_tab[code_result_index]; int64_t pos_variant_read = code_result_tab[code_result_index + 1]; + /*printf("pos variant: %lu\n", pos_variant_read);*/ + int64_t pos_variant_genome = genome_pos + pos_variant_read; if (code_result == CODE_SUB) { // do nothing for substitution code_result_index += 3; (*substCnt)++; + /*printf("S");*/ + ref_pos++; } else if (code_result == CODE_INS) { - while(read_pos <= pos_variant_read) { - update_genome_position[read_pos++] = curr_pos++; + if(nbIndels) { + // for the moment support only one indel otherwise we have some issue FIXME + printf("Z"); + break; } - // insertion, skip these positions in the read + + int64_t 
ps_var_genome = pos_variant_genome; + int64_t ps_var_read = pos_variant_read; code_result_index += 2; - while (code_result_tab[code_result_index] < 4) { - code_result_index++; + + while (code_result_tab[code_result_index] < 4) { + ps_var_read++; + code_result_index++; + } + + while (ref_genome->data[ps_var_genome] == read[ps_var_read] && ps_var_genome + && pos_variant_read && pos_variant_read >= read_pos) { + assert(ps_var_genome && ps_var_read && pos_variant_genome && pos_variant_read); + ps_var_genome--; + ps_var_read--; + pos_variant_genome--; + pos_variant_read--; + } + + /*newvar->ref[ref_pos++] = nucleotide[ref_genome->data[pos_variant_genome] & 3];*/ + ref_pos++; + + // skip first value which should be the equivalent of first element in ref genome + pos_variant_read++; + /*printf("read_pos %lu pos_variant_read %lu\n", read_pos, pos_variant_read);*/ + if(!nbIndels) + printf("SW results:\n"); + while(read_pos < pos_variant_read) { + assert(read_pos < SIZE_READ); + update_genome_position[read_pos++] = curr_pos++; + printf(" "); + } + /*printf("2 read_pos %lu pos_variant_read %lu\n", read_pos, pos_variant_read);*/ + assert(read_pos == pos_variant_read); + while (pos_variant_read <= ps_var_read) { update_genome_position[read_pos++] = UINT64_MAX; + pos_variant_read++; + printf("I"); } /*printf("Insertion pos %lu\n", pos_variant_read);*/ - hasIndel = true; + ++nbIndels; } else if (code_result == CODE_DEL) { - while(read_pos < pos_variant_read) { - update_genome_position[read_pos++] = curr_pos++; + if(nbIndels) { + // for the moment support only one indel otherwise we have some issue FIXME + printf("Z"); + break; } - // deletion, skip these positions in the reference genome + + int64_t ps_var_genome = pos_variant_genome; + int64_t ps_var_read = pos_variant_read; code_result_index += 2; - while (code_result_tab[code_result_index] < 4) { - code_result_index++; + + while (code_result_tab[code_result_index] < 4) { + ps_var_genome++; + code_result_index++; + } + + while 
(ref_genome->data[ps_var_genome] == read[ps_var_read] && pos_variant_genome && ps_var_read) { + assert(ps_var_genome && ps_var_read && pos_variant_genome && pos_variant_read); + ps_var_read--; + ps_var_genome--; + pos_variant_genome--; + pos_variant_read--; + } + + /*newvar->alt[alt_pos++] = nucleotide[ref_genome->data[pos_variant_genome] & 3];*/ + if(!nbIndels) + printf("SW results:\n"); + + // skip first position which is same as in the read + while(read_pos <= pos_variant_read) { + update_genome_position[read_pos++] = curr_pos++; + printf(" "); + } + pos_variant_genome++; + while (pos_variant_genome <= ps_var_genome) { curr_pos++; + printf("D"); + pos_variant_genome++; + ref_pos++; } - /*printf("Deletion pos %lu\n", pos_variant_read);*/ - hasIndel = true; + pos_variant_genome -= ref_pos; + ++nbIndels; } else assert(0); @@ -463,7 +534,11 @@ bool get_read_update_positions( while(read_pos < SIZE_READ) update_genome_position[read_pos++] = curr_pos++; - return hasIndel; + if(nbIndels) { + printf("\n"); + fflush(stdout); + } + return nbIndels; } else assert(0); @@ -503,12 +578,13 @@ bool update_frequency_table( uint64_t update_genome_position[SIZE_READ]; uint32_t substCnt = 0; flag_dbg = false; - bool hasIndel = get_read_update_positions(update_genome_position, result_tab, pos, + pthread_mutex_lock(&freq_table_mutex); + int nbIndels = get_read_update_positions(update_genome_position, result_tab, pos, ref_genome, genome_pos, read, size_neighbour_in_symbols, &flag_dbg, &substCnt); + bool hasIndel = nbIndels; if(hasIndel && debug) { - assert(!result_tab[pos].coord.nodp); printf("Read:\n"); for(int k = 0; k < SIZE_READ; ++k) { printf("%c", nucleotide[read[k]]); @@ -556,6 +632,7 @@ bool update_frequency_table( } printf("\n\n"); fflush(stdout); + assert(!result_tab[pos].coord.nodp); } else if(debug) { if(!result_tab[pos].coord.nodp) { @@ -573,8 +650,9 @@ bool update_frequency_table( } } - pthread_mutex_lock(&freq_table_mutex); - if(substCnt <= MAX_SUBSTITUTION && (hasIndel 
|| result_tab[pos].coord.nodp)) { + /*pthread_mutex_lock(&freq_table_mutex);*/ + // for the moment support only one indel otherwise we have some issue FIXME + if(substCnt <= MAX_SUBSTITUTION && (hasIndel || result_tab[pos].coord.nodp) && nbIndels < 2) { for(uint64_t k = 0; k < SIZE_READ; ++k) { uint64_t update_genome_pos = update_genome_position[k]; if(update_genome_pos < genome_get()->fasta_file_size) { From 7323e03a3ac30bcf0a19b688579fd4a280d21970 Mon Sep 17 00:00:00 2001 From: Julien Legriel Date: Wed, 2 Jun 2021 17:04:09 +0200 Subject: [PATCH 15/48] last fix from dominique in SW plus some debug. Still some issue in how I get the results in case of several indels. Will try to fix that --- host/src/processread.c | 166 +++++++++++++++++++++++++---------------- 1 file changed, 102 insertions(+), 64 deletions(-) diff --git a/host/src/processread.c b/host/src/processread.c index 8c3b670..cbfd753 100644 --- a/host/src/processread.c +++ b/host/src/processread.c @@ -46,26 +46,32 @@ typedef struct { static int min(int a, int b) { return a < b ? 
a : b; } static void DPD_compute( - int s1, int s2, int *Dij, int Dijm, int Dimj, int Dimjm, int *Pij, int Pijm, int *Qij, int Qimj) + int s1, int s2, int *Dij, int Dijm, int Dimj, int Dimjm, int *Pij, int Pijm, int *Qij, int Qimj, int *xij) { int min_QP, d; *Pij = min(Dijm + COST_GAPO, Pijm + COST_GAPE); *Qij = min(Dimj + COST_GAPO, Qimj + COST_GAPE); + *xij = 0; + int x; if (*Pij < *Qij) { min_QP = *Pij; + x = 2; } else { min_QP = *Qij; + x = 3; } d = Dimjm; if ((s1 & 3) != (s2 & 3)) { d += COST_SUB; + *xij = 1; } if (d < min_QP) { *Dij = d; } else { *Dij = min_QP; + *xij = x; } } @@ -76,6 +82,7 @@ int DPD(int8_t *s1, int8_t *s2, backtrack_t *backtrack, int size_neighbour_in_sy int D[matrix_size][matrix_size]; int P[matrix_size][matrix_size]; int Q[matrix_size][matrix_size]; + int X[matrix_size][matrix_size]; int min_score = PQD_INIT_VAL; int min_score_i_idx = 0; int min_score_j_idx = 0; @@ -98,7 +105,7 @@ int DPD(int8_t *s1, int8_t *s2, backtrack_t *backtrack, int size_neighbour_in_sy for (int i = 1; i < diagonal; i++) { for (int j = 1; j < i + diagonal; j++) { DPD_compute(s1[i - 1], s2[j - 1], &D[i][j], D[i][j - 1], D[i - 1][j], D[i - 1][j - 1], &P[i][j], P[i][j - 1], - &Q[i][j], Q[i - 1][j]); + &Q[i][j], Q[i - 1][j], &X[i][j]); } Q[i][i + diagonal] = PQD_INIT_VAL; D[i][i + diagonal] = PQD_INIT_VAL; @@ -108,7 +115,7 @@ int DPD(int8_t *s1, int8_t *s2, backtrack_t *backtrack, int size_neighbour_in_sy D[i][i - diagonal] = PQD_INIT_VAL; for (int j = i - diagonal + 1; j < i + diagonal; j++) { DPD_compute(s1[i - 1], s2[j - 1], &D[i][j], D[i][j - 1], D[i - 1][j], D[i - 1][j - 1], &P[i][j], P[i][j - 1], - &Q[i][j], Q[i - 1][j]); + &Q[i][j], Q[i - 1][j], &X[i][j]); } Q[i][i + diagonal] = PQD_INIT_VAL; D[i][i + diagonal] = PQD_INIT_VAL; @@ -119,7 +126,7 @@ int DPD(int8_t *s1, int8_t *s2, backtrack_t *backtrack, int size_neighbour_in_sy D[i][i - diagonal] = PQD_INIT_VAL; for (int j = i - diagonal + 1; j < matrix_size; j++) { DPD_compute(s1[i - 1], s2[j - 1], &D[i][j], 
D[i][j - 1], D[i - 1][j], D[i - 1][j - 1], &P[i][j], P[i][j - 1], - &Q[i][j], Q[i - 1][j]); + &Q[i][j], Q[i - 1][j], &X[i][j]); } if (D[i][matrix_size - 1] < min_score) { min_score = D[i][matrix_size - 1]; @@ -140,31 +147,48 @@ int DPD(int8_t *s1, int8_t *s2, backtrack_t *backtrack, int size_neighbour_in_sy int j = min_score_j_idx; backtrack[0].type = CODE_END; while ((i > 0) && (j > 0)) { - int hv = (D[i-1][j] < D[i][j-1]) ? D[i-1][j] : D[i][j-1]; - if (D[i-1][j-1] <= hv) { + /*int hv = (D[i-1][j] < D[i][j-1]) ? D[i-1][j] : D[i][j-1];*/ + /*if (D[i-1][j-1] <= hv) {*/ + if(X[i][j] == 0) { i--; j--; - if (D[i][j] != D[i - 1][j - 1]) { - backtrack[align_distance].type = CODE_SUB; - backtrack[align_distance].ix = i; - backtrack[align_distance].jx = j; - align_distance++; - } + //if (D[i][j] != D[i - 1][j - 1]) { + // backtrack[align_distance].type = CODE_SUB; + // backtrack[align_distance].ix = i; + // backtrack[align_distance].jx = j; + // align_distance++; + //} } else { - if (D[i-1][j] > D[i][j-1]) { + /*if (D[i-1][j] > D[i][j-1]) {*/ + if(X[i][j] == 1) { + i--; j--; - flag_dbg = true; - backtrack[align_distance].type = CODE_INS; + /*backtrack[align_distance].type = CODE_INS;*/ + backtrack[align_distance].type = CODE_SUB; backtrack[align_distance].ix = i; backtrack[align_distance].jx = j; align_distance++; } else { - i--; - flag_dbg = true; - backtrack[align_distance].type = CODE_DEL; - backtrack[align_distance].ix = i; - backtrack[align_distance].jx = j; - align_distance++; + if(X[i][j] == 2) { + j--; + backtrack[align_distance].type = CODE_INS; + backtrack[align_distance].ix = i; + backtrack[align_distance].jx = j; + align_distance++; + } + else { + i--; + backtrack[align_distance].type = CODE_DEL; + backtrack[align_distance].ix = i; + backtrack[align_distance].jx = j; + align_distance++; + } + //i--; + //flag_dbg = true; + //backtrack[align_distance].type = CODE_DEL; + //backtrack[align_distance].ix = i; + //backtrack[align_distance].jx = j; + 
//align_distance++; } } } @@ -222,7 +246,7 @@ static int code_alignment(uint8_t *code, int score, int8_t *gen, int8_t *read, u return code_idx; /* Otherwise, re-compute the matrix (only some diagonals) and put in backtrack the path */ - backtrack_idx = DPD(gen, read, backtrak, size_neighbour_in_symbols); + backtrack_idx = DPD(gen, read, backtrak, size_neighbour_in_symbols + SIZE_SEED); if (backtrack_idx == -1) { code[0] = CODE_ERR; return 1; @@ -409,22 +433,25 @@ int get_read_update_positions( int8_t *read, __attribute__((unused))int size_neighbour_in_symbols, bool * flag, + bool debug, uint32_t * substCnt) { // run smith and waterman algorithm to find indels uint8_t code_result_tab[256]; code_alignment(code_result_tab, result_tab[pos].score, &ref_genome->data[genome_pos], read, size_neighbour_in_symbols, flag); + for(int read_pos = 0; read_pos < SIZE_READ; ++read_pos) { + update_genome_position[read_pos] = 0; + } if (code_result_tab[0] != CODE_ERR) { // array that will contain for each read position, the genome position that it matches too // This is the genome position that will be updated in the frequency table // This genome position takes into account the shift due to possible indels found // with smith-waterman algorithm - int64_t curr_pos = genome_pos; int code_result_index = 0; - int64_t read_pos = 0; int ref_pos = 0; int nbIndels = 0; + bool ins = false; while (code_result_tab[code_result_index] != CODE_END) { int code_result = code_result_tab[code_result_index]; int64_t pos_variant_read = code_result_tab[code_result_index + 1]; @@ -434,16 +461,10 @@ int get_read_update_positions( // do nothing for substitution code_result_index += 3; (*substCnt)++; - /*printf("S");*/ ref_pos++; } else if (code_result == CODE_INS) { - if(nbIndels) { - // for the moment support only one indel otherwise we have some issue FIXME - printf("Z"); - break; - } - + ins = true; int64_t ps_var_genome = pos_variant_genome; int64_t ps_var_read = pos_variant_read; code_result_index += 
2; @@ -454,7 +475,7 @@ int get_read_update_positions( } while (ref_genome->data[ps_var_genome] == read[ps_var_read] && ps_var_genome - && pos_variant_read && pos_variant_read >= read_pos) { + && pos_variant_read) { assert(ps_var_genome && ps_var_read && pos_variant_genome && pos_variant_read); ps_var_genome--; ps_var_read--; @@ -468,29 +489,21 @@ int get_read_update_positions( // skip first value which should be the equivalent of first element in ref genome pos_variant_read++; /*printf("read_pos %lu pos_variant_read %lu\n", read_pos, pos_variant_read);*/ - if(!nbIndels) - printf("SW results:\n"); - while(read_pos < pos_variant_read) { - assert(read_pos < SIZE_READ); - update_genome_position[read_pos++] = curr_pos++; - printf(" "); - } - /*printf("2 read_pos %lu pos_variant_read %lu\n", read_pos, pos_variant_read);*/ - assert(read_pos == pos_variant_read); while (pos_variant_read <= ps_var_read) { - update_genome_position[read_pos++] = UINT64_MAX; - pos_variant_read++; - printf("I"); + // position should not be updated yet + if(update_genome_position[pos_variant_read] != 0) { + printf("Warning: duplicate update (Insertion) at position %lu. 
Current %lu\n", + pos_variant_read, update_genome_position[pos_variant_read]); + fflush(stdout); + return -1; + } + /*assert(update_genome_position[pos_variant_read] == 0);*/ + update_genome_position[pos_variant_read++] = UINT64_MAX; } /*printf("Insertion pos %lu\n", pos_variant_read);*/ ++nbIndels; } else if (code_result == CODE_DEL) { - if(nbIndels) { - // for the moment support only one indel otherwise we have some issue FIXME - printf("Z"); - break; - } int64_t ps_var_genome = pos_variant_genome; int64_t ps_var_read = pos_variant_read; @@ -510,18 +523,21 @@ int get_read_update_positions( } /*newvar->alt[alt_pos++] = nucleotide[ref_genome->data[pos_variant_genome] & 3];*/ - if(!nbIndels) - printf("SW results:\n"); - // skip first position which is same as in the read - while(read_pos <= pos_variant_read) { - update_genome_position[read_pos++] = curr_pos++; - printf(" "); + // on a deletion store the threshold to apply from the current read position + assert(ps_var_genome > pos_variant_genome); + if(pos_variant_read + 1 < SIZE_READ) { + // position should not be updated yet + if(update_genome_position[pos_variant_read+1] != 0) { + printf("Warning: duplicate update (Deletion) at position %lu. 
Current %lu\n", + pos_variant_read+1, update_genome_position[pos_variant_read+1]); + fflush(stdout); + return -1; + } + /*assert(update_genome_position[pos_variant_read+1] == 0);*/ + update_genome_position[pos_variant_read + 1] = ps_var_genome - pos_variant_genome; } - pos_variant_genome++; while (pos_variant_genome <= ps_var_genome) { - curr_pos++; - printf("D"); pos_variant_genome++; ref_pos++; } @@ -531,10 +547,31 @@ int get_read_update_positions( else assert(0); } - while(read_pos < SIZE_READ) - update_genome_position[read_pos++] = curr_pos++; + if(nbIndels && debug) + printf("SW algorithm (nbIndels %d) ins %d:\n", nbIndels, ins); + int64_t curr_pos = genome_pos; + for(int read_pos = 0; read_pos < SIZE_READ; ++read_pos) { + switch(update_genome_position[read_pos]) { + case 0: + update_genome_position[read_pos] = curr_pos++; + if(nbIndels && debug) + printf(" "); + break; + case UINT64_MAX: + if(nbIndels && debug) + printf("I"); + break; + default: + if(nbIndels && debug) { + for(uint64_t print_index = 0; print_index < update_genome_position[read_pos]; ++print_index) + printf("D"); + } + curr_pos += update_genome_position[read_pos]; + update_genome_position[read_pos] = curr_pos++; + } + } - if(nbIndels) { + if(nbIndels && debug) { printf("\n"); fflush(stdout); } @@ -580,8 +617,8 @@ bool update_frequency_table( flag_dbg = false; pthread_mutex_lock(&freq_table_mutex); int nbIndels = get_read_update_positions(update_genome_position, result_tab, pos, - ref_genome, genome_pos, read, size_neighbour_in_symbols, &flag_dbg, &substCnt); - bool hasIndel = nbIndels; + ref_genome, genome_pos, read, size_neighbour_in_symbols, &flag_dbg, debug, &substCnt); + bool hasIndel = nbIndels > 0; if(hasIndel && debug) { @@ -652,7 +689,7 @@ bool update_frequency_table( /*pthread_mutex_lock(&freq_table_mutex);*/ // for the moment support only one indel otherwise we have some issue FIXME - if(substCnt <= MAX_SUBSTITUTION && (hasIndel || result_tab[pos].coord.nodp) && nbIndels < 2) { + 
if(substCnt <= MAX_SUBSTITUTION && (hasIndel || result_tab[pos].coord.nodp) && nbIndels >= 0) { for(uint64_t k = 0; k < SIZE_READ; ++k) { uint64_t update_genome_pos = update_genome_position[k]; if(update_genome_pos < genome_get()->fasta_file_size) { @@ -763,6 +800,7 @@ static void do_process_read(process_read_arg_t *arg) FILE *fpe1 = arg->fpe1; FILE *fpe2 = arg->fpe2; unsigned int size_neighbour_in_symbols = (SIZE_NEIGHBOUR_IN_BYTES - DELTA_NEIGHBOUR(round)) * 4; + printf("size_neighbour_in_symbols : %u", size_neighbour_in_symbols); /* * The number of a pair is given by "num_read / 4 " (see dispatch_read function) From a62d0b901ea589d294d6a412cfd66716290d68f0 Mon Sep 17 00:00:00 2001 From: Julien Legriel Date: Fri, 19 Nov 2021 10:49:22 +0100 Subject: [PATCH 16/48] A bit of cosmetics Some modifications in SW results read which I probably need to revert --- common/inc/common.h | 4 +- host/src/genome.c | 6 +- host/src/getread.c | 6 +- host/src/processread.c | 149 +++++++++++++++++------------------------ host/src/vartree.c | 48 ++++++------- 5 files changed, 94 insertions(+), 119 deletions(-) diff --git a/common/inc/common.h b/common/inc/common.h index 8ba69d8..8b4f42a 100644 --- a/common/inc/common.h +++ b/common/inc/common.h @@ -18,8 +18,8 @@ #define MAX_DPU_RESULTS (1 << 19) #define MAX_RESULTS_PER_READ (1 << 10) -//#define SIZE_READ 148 -#define SIZE_READ 120 +#define SIZE_READ 148 +//#define SIZE_READ 120 #define SIZE_SEED 14 #define SIZE_NEIGHBOUR_IN_BYTES ((SIZE_READ - SIZE_SEED) / 4) #define DELTA_NEIGHBOUR(round) ((SIZE_SEED * round) / 4) diff --git a/host/src/genome.c b/host/src/genome.c index ca113bd..238f277 100644 --- a/host/src/genome.c +++ b/host/src/genome.c @@ -115,7 +115,11 @@ void genome_free() free(genome.mapping_coverage); } -//TODO free function +/** + * Frequency table + * 5 entries (A, C, T, G, -) + * Each entry is a table of the size of the reference genome + **/ static struct frequency_info* frequency_table[5]; static bool 
init_frequency_table = false; diff --git a/host/src/getread.c b/host/src/getread.c index 7f5572a..e5c672d 100644 --- a/host/src/getread.c +++ b/host/src/getread.c @@ -75,11 +75,9 @@ static int get_seq_fast_AQ(FILE *f, int8_t *read1, int8_t *read2, float *read_qu if (fgets(sequence_buffer, MAX_SEQ_SIZE, f) == NULL) { /* Line with sequence quality information */ return -1; } - //TODO: store quality information + // store quality information for (i = 0; i < SIZE_READ - offset; i++) { int Q = sequence_buffer[i]; - /*printf("index %d SIZE_READ %d\n", i, SIZE_READ);*/ - /*fflush(stdout);*/ read_quality_factor[i] = quality_lookup_table[Q-33]; } for (; i < SIZE_READ; i++) { @@ -115,8 +113,6 @@ void get_reads(FILE *fpe1, FILE *fpe2, unsigned int pass_id) } while (nb_read < MAX_READS_BUFFER) { - /*assert(nb_read/2 + 1 < MAX_READS_BUFFER/2);*/ - /*printf("nb_read %d MAX_READS_BUFFER %d\n", nb_read, MAX_READS_BUFFER);*/ if ((get_seq_fast_AQ(fpe1, &reads_buffer[(nb_read + 0) * SIZE_READ], &reads_buffer[(nb_read + 1) * SIZE_READ], &reads_quality_buffer[nb_read/2 * SIZE_READ]) <= 0) || (get_seq_fast_AQ(fpe2, &reads_buffer[(nb_read + 2) * SIZE_READ], &reads_buffer[(nb_read + 3) * SIZE_READ], diff --git a/host/src/processread.c b/host/src/processread.c index cbfd753..a2c3779 100644 --- a/host/src/processread.c +++ b/host/src/processread.c @@ -147,23 +147,13 @@ int DPD(int8_t *s1, int8_t *s2, backtrack_t *backtrack, int size_neighbour_in_sy int j = min_score_j_idx; backtrack[0].type = CODE_END; while ((i > 0) && (j > 0)) { - /*int hv = (D[i-1][j] < D[i][j-1]) ? 
D[i-1][j] : D[i][j-1];*/ - /*if (D[i-1][j-1] <= hv) {*/ if(X[i][j] == 0) { i--; j--; - //if (D[i][j] != D[i - 1][j - 1]) { - // backtrack[align_distance].type = CODE_SUB; - // backtrack[align_distance].ix = i; - // backtrack[align_distance].jx = j; - // align_distance++; - //} } else { - /*if (D[i-1][j] > D[i][j-1]) {*/ if(X[i][j] == 1) { i--; j--; - /*backtrack[align_distance].type = CODE_INS;*/ backtrack[align_distance].type = CODE_SUB; backtrack[align_distance].ix = i; backtrack[align_distance].jx = j; @@ -183,12 +173,6 @@ int DPD(int8_t *s1, int8_t *s2, backtrack_t *backtrack, int size_neighbour_in_sy backtrack[align_distance].jx = j; align_distance++; } - //i--; - //flag_dbg = true; - //backtrack[align_distance].type = CODE_DEL; - //backtrack[align_distance].ix = i; - //backtrack[align_distance].jx = j; - //align_distance++; } } } @@ -444,7 +428,7 @@ int get_read_update_positions( } if (code_result_tab[0] != CODE_ERR) { - // array that will contain for each read position, the genome position that it matches too + // array that contains for each read position, the genome position that it matches with // This is the genome position that will be updated in the frequency table // This genome position takes into account the shift due to possible indels found // with smith-waterman algorithm @@ -455,13 +439,11 @@ int get_read_update_positions( while (code_result_tab[code_result_index] != CODE_END) { int code_result = code_result_tab[code_result_index]; int64_t pos_variant_read = code_result_tab[code_result_index + 1]; - /*printf("pos variant: %lu\n", pos_variant_read);*/ - int64_t pos_variant_genome = genome_pos + pos_variant_read; + int64_t pos_variant_genome = genome_pos + pos_variant_read + ref_pos; if (code_result == CODE_SUB) { // do nothing for substitution code_result_index += 3; (*substCnt)++; - ref_pos++; } else if (code_result == CODE_INS) { ins = true; @@ -472,6 +454,7 @@ int get_read_update_positions( while (code_result_tab[code_result_index] < 4) { 
ps_var_read++; code_result_index++; + ref_pos--; } while (ref_genome->data[ps_var_genome] == read[ps_var_read] && ps_var_genome @@ -483,12 +466,8 @@ int get_read_update_positions( pos_variant_read--; } - /*newvar->ref[ref_pos++] = nucleotide[ref_genome->data[pos_variant_genome] & 3];*/ - ref_pos++; - // skip first value which should be the equivalent of first element in ref genome pos_variant_read++; - /*printf("read_pos %lu pos_variant_read %lu\n", read_pos, pos_variant_read);*/ while (pos_variant_read <= ps_var_read) { // position should not be updated yet if(update_genome_position[pos_variant_read] != 0) { @@ -497,10 +476,8 @@ int get_read_update_positions( fflush(stdout); return -1; } - /*assert(update_genome_position[pos_variant_read] == 0);*/ update_genome_position[pos_variant_read++] = UINT64_MAX; } - /*printf("Insertion pos %lu\n", pos_variant_read);*/ ++nbIndels; } else if (code_result == CODE_DEL) { @@ -512,6 +489,7 @@ int get_read_update_positions( while (code_result_tab[code_result_index] < 4) { ps_var_genome++; code_result_index++; + ref_pos++; } while (ref_genome->data[ps_var_genome] == read[ps_var_read] && pos_variant_genome && ps_var_read) { @@ -522,8 +500,6 @@ int get_read_update_positions( pos_variant_read--; } - /*newvar->alt[alt_pos++] = nucleotide[ref_genome->data[pos_variant_genome] & 3];*/ - // on a deletion store the threshold to apply from the current read position assert(ps_var_genome > pos_variant_genome); if(pos_variant_read + 1 < SIZE_READ) { @@ -537,16 +513,18 @@ int get_read_update_positions( /*assert(update_genome_position[pos_variant_read+1] == 0);*/ update_genome_position[pos_variant_read + 1] = ps_var_genome - pos_variant_genome; } - while (pos_variant_genome <= ps_var_genome) { - pos_variant_genome++; - ref_pos++; - } - pos_variant_genome -= ref_pos; + //while (pos_variant_genome <= ps_var_genome) { + // pos_variant_genome++; + // ref_pos++; + //} + //pos_variant_genome -= ref_pos; ++nbIndels; } else assert(0); } + + // debug 
prints if(nbIndels && debug) printf("SW algorithm (nbIndels %d) ins %d:\n", nbIndels, ins); int64_t curr_pos = genome_pos; @@ -610,16 +588,21 @@ bool update_frequency_table( #ifdef USE_INDEL - static bool debug = true; + static bool debug = true; static char nucleotide[4] = { 'A', 'C', 'T', 'G' }; uint64_t update_genome_position[SIZE_READ]; uint32_t substCnt = 0; flag_dbg = false; + + // for simplicity put all this in a critical section protected by a mutex + // since the frequency table is shared (but inefficient) + pthread_mutex_lock(&freq_table_mutex); int nbIndels = get_read_update_positions(update_genome_position, result_tab, pos, ref_genome, genome_pos, read, size_neighbour_in_symbols, &flag_dbg, debug, &substCnt); bool hasIndel = nbIndels > 0; + // debug prints if(hasIndel && debug) { printf("Read:\n"); @@ -629,47 +612,45 @@ bool update_frequency_table( printf("\ngenome (pos %u:%u):\n", result_tab[pos].coord.seed_nr, result_tab[pos].coord.seed_nr+SIZE_READ); for(uint64_t k = genome_pos; k < genome_pos + SIZE_READ; ++k) { printf("%c", nucleotide[ref_genome->data[k]]); - } -/*#define RESET "\033[0m"*/ -/*#define RED "\033[31m" [> Red <]*/ - printf("\nupdate pos:\n"); - uint64_t lastpos = 0; - for(uint64_t k = 0; k < SIZE_READ; ++k) { - if(k && update_genome_position[k] != lastpos+1) { - if(update_genome_position[k] == UINT64_MAX) - printf("X"); - /*printf("No update at position %lu\n", k);*/ - else if(lastpos == UINT64_MAX) - /*printf("New start at pos %lu = %lu / %c\n", k, update_genome_position[k], nucleotide[ref_genome->data[update_genome_position[k]]]);*/ - printf("%c", nucleotide[ref_genome->data[update_genome_position[k]]]); - else - /*printf("Change at pos %lu, diff %ld, %c\n", k, update_genome_position[k] - lastpos, nucleotide[ref_genome->data[update_genome_position[k]]]);*/ - printf("%c", nucleotide[ref_genome->data[update_genome_position[k]]]); - } - /*else if(nucleotide[ref_genome->data[update_genome_position[k]]] != nucleotide[read[k]]) {*/ - 
/*printf(RED "%c" RESET, nucleotide[ref_genome->data[update_genome_position[k]]]);*/ - /*}*/ - else - printf("%c", nucleotide[ref_genome->data[update_genome_position[k]]]); - lastpos = update_genome_position[k]; - } + } + printf("\nupdate pos:\n"); + uint64_t lastpos = 0; + for(uint64_t k = 0; k < SIZE_READ; ++k) { + if(k && update_genome_position[k] != lastpos+1) { + if(update_genome_position[k] == UINT64_MAX) + printf("X"); + /*printf("No update at position %lu\n", k);*/ + else if(lastpos == UINT64_MAX) + /*printf("New start at pos %lu = %lu / %c\n", k, update_genome_position[k], nucleotide[ref_genome->data[update_genome_position[k]]]);*/ + printf("%c", nucleotide[ref_genome->data[update_genome_position[k]]]); + else + /*printf("Change at pos %lu, diff %ld, %c\n", k, update_genome_position[k] - lastpos, nucleotide[ref_genome->data[update_genome_position[k]]]);*/ + printf("%c", nucleotide[ref_genome->data[update_genome_position[k]]]); + } + /*else if(nucleotide[ref_genome->data[update_genome_position[k]]] != nucleotide[read[k]]) {*/ + /*printf(RED "%c" RESET, nucleotide[ref_genome->data[update_genome_position[k]]]);*/ + /*}*/ + else + printf("%c", nucleotide[ref_genome->data[update_genome_position[k]]]); + lastpos = update_genome_position[k]; + } - printf("\nsubst:\n"); - for(uint64_t k = 0; k < SIZE_READ; ++k) { - if(update_genome_position[k] == UINT64_MAX) { - printf(" "); - continue; - } - else if(nucleotide[ref_genome->data[update_genome_position[k]]] != nucleotide[read[k]]) { - printf("U"); - substCnt++; - } - else - printf(" "); - } - printf("\n\n"); - fflush(stdout); - assert(!result_tab[pos].coord.nodp); + printf("\nsubst:\n"); + for(uint64_t k = 0; k < SIZE_READ; ++k) { + if(update_genome_position[k] == UINT64_MAX) { + printf(" "); + continue; + } + else if(nucleotide[ref_genome->data[update_genome_position[k]]] != nucleotide[read[k]]) { + printf("U"); + substCnt++; + } + else + printf(" "); + } + printf("\n\n"); + fflush(stdout); + 
assert(!result_tab[pos].coord.nodp); } else if(debug) { if(!result_tab[pos].coord.nodp) { @@ -712,11 +693,11 @@ bool update_frequency_table( frequency_table[read[j]][genome_pos+j].freq += mapq * read_quality[inv ? SIZE_READ - j - 1 : j]; frequency_table[read[j]][genome_pos+j].score++; } - else - printf("WARNING: reads matched at position that exceeds genome size\n"); - } - pthread_mutex_unlock(&freq_table_mutex); - return false; + else + printf("WARNING: reads matched at position that exceeds genome size\n"); + } + pthread_mutex_unlock(&freq_table_mutex); + return false; #endif } @@ -800,7 +781,7 @@ static void do_process_read(process_read_arg_t *arg) FILE *fpe1 = arg->fpe1; FILE *fpe2 = arg->fpe2; unsigned int size_neighbour_in_symbols = (SIZE_NEIGHBOUR_IN_BYTES - DELTA_NEIGHBOUR(round)) * 4; - printf("size_neighbour_in_symbols : %u", size_neighbour_in_symbols); + /*printf("size_neighbour_in_symbols : %u", size_neighbour_in_symbols);*/ /* * The number of a pair is given by "num_read / 4 " (see dispatch_read function) @@ -857,14 +838,6 @@ static void do_process_read(process_read_arg_t *arg) bool update = false; bool hasIndel = false; -#if 0 - for (unsigned int read = i; read < j; read++) { - - hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, read, 1.0f, size_neighbour_in_symbols); - } - if(true) { -#endif - unsigned np = get_nb_scores(best_score); if (np > 0) { diff --git a/host/src/vartree.c b/host/src/vartree.c index 21ba0e0..38528f7 100644 --- a/host/src/vartree.c +++ b/host/src/vartree.c @@ -163,6 +163,7 @@ static bool print_variant_tree(variant_t *var, uint32_t seq_nr, uint64_t seq_pos uint32_t cov = ref_genome->mapping_coverage[genome_pos]; uint32_t depth = var->depth; uint32_t score = var->score / depth; + // Note: commenting out the old version of variant calling, now using the frequency table //uint32_t percentage = 100; //if (cov != 0) { // percentage = depth * 100 / cov; @@ -208,14 +209,12 @@ static bool 
print_variant_tree(variant_t *var, uint32_t seq_nr, uint64_t seq_pos return true; } -/* -great that you have implemented the quality score weights in the frequency table. I think using D=1 is not advisable since the number of false positives becomes extremely high, also with D=2 we need to be very careful. Thus, I would recommend to test the accuracy using Q-scores with our previously used parameters first (e.g. D>=3 and 20%, and the ones I have suggested in (1) (i, ii, and iii) and compare them to our old results. - -(i) D=3: 25%, D=4: 20%, D=5: 15%; D>=6: 10% -(ii) D=2: 30%, D=3: 25%, D=4: 20%, D=5: 15%; D>=6: 10% -(iii) D=3: 20%, D=4: 15%, D>=5: 10%; - * - * */ +/** + Few configurations suggested by Bertil + (i) D=3: 25%, D=4: 20%, D=5: 15%; D>=6: 10% + (ii) D=2: 30%, D=3: 25%, D=4: 20%, D=5: 15%; D>=6: 10% + (iii) D=3: 20%, D=4: 15%, D>=5: 10%; + **/ __attribute__((unused)) uint32_t depth_filter1(float freq) { if(freq < 10.0f) @@ -295,21 +294,21 @@ static variant_t ** get_most_frequent_variant(genome_t * ref_genome, struct freq for(int i = 0; i < 5; ++i) { float freq = frequency_table[i][genome_pos].freq; uint32_t score = frequency_table[i][genome_pos].score; - if(i == ref_genome->data[genome_pos]) continue; // not a variant if the same nucleotide as in reference genome - /*if((freq / total > FREQUENCY_THRESHOLD) */ - if(score >= depth_filter(freq * 100.0 / total)) { // if frequency and depth pass the threshold, consider it a variant - - /*printf("variant depth %u freq %f threshold %u\n", score, freq,*/ - /*depth_filter(freq * 100.0 / total));*/ - // this is a substitution, create variant - variant_t *var = (variant_t *)malloc(sizeof(variant_t)); - var->score = frequency_table[i][genome_pos].score; - var->depth = frequency_table[i][genome_pos].score; - var->ref[0] = nucleotide[ref_genome->data[genome_pos]]; - var->ref[1] = '\0'; - var->alt[0] = nucleotide[i]; - var->alt[1] = '\0'; - results[i] = var; + if(i == ref_genome->data[genome_pos]) + continue; // not a 
variant if the same nucleotide as in reference genome + + // if frequency and depth pass the threshold, consider it a variant + if(score >= depth_filter(freq * 100.0 / total)) { + + // this is a substitution, create variant + variant_t *var = (variant_t *)malloc(sizeof(variant_t)); + var->score = frequency_table[i][genome_pos].score; + var->depth = frequency_table[i][genome_pos].score; + var->ref[0] = nucleotide[ref_genome->data[genome_pos]]; + var->ref[1] = '\0'; + var->alt[0] = nucleotide[i]; + var->alt[1] = '\0'; + results[i] = var; } } //printf("get_most_frequent_variant: genome_pos %lu, nucleotide max freq %d %f %c\n", genome_pos, nucId, max, nucId >= 0 ? nucleotide[nucId] : '-'); @@ -357,6 +356,9 @@ void create_vcf() struct frequency_info **frequency_table = get_frequency_table(); uint32_t nb_pos_multiple_var = 0; + /** + * Dump debugging information: frequency table for a given set of positions + **/ dbg_file = fopen("freq_debug.txt", "w"); sub_file = fopen("subst.txt", "r"); assert(sub_file); From 53e3e71f3a859818fe60b03b857fd80957a6cc83 Mon Sep 17 00:00:00 2001 From: Julien Legriel Date: Mon, 22 Nov 2021 15:00:09 +0100 Subject: [PATCH 17/48] Disable debug code. Revert latest modifications in SW results parsing. 
--- host/src/processread.c | 21 ++++++++++++--------- host/src/vartree.c | 2 ++ 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/host/src/processread.c b/host/src/processread.c index a2c3779..4bed7b2 100644 --- a/host/src/processread.c +++ b/host/src/processread.c @@ -439,11 +439,13 @@ int get_read_update_positions( while (code_result_tab[code_result_index] != CODE_END) { int code_result = code_result_tab[code_result_index]; int64_t pos_variant_read = code_result_tab[code_result_index + 1]; - int64_t pos_variant_genome = genome_pos + pos_variant_read + ref_pos; + /*printf("pos variant: %lu\n", pos_variant_read);*/ + int64_t pos_variant_genome = genome_pos + pos_variant_read; if (code_result == CODE_SUB) { // do nothing for substitution code_result_index += 3; (*substCnt)++; + ref_pos++; } else if (code_result == CODE_INS) { ins = true; @@ -454,7 +456,6 @@ int get_read_update_positions( while (code_result_tab[code_result_index] < 4) { ps_var_read++; code_result_index++; - ref_pos--; } while (ref_genome->data[ps_var_genome] == read[ps_var_read] && ps_var_genome @@ -466,6 +467,9 @@ int get_read_update_positions( pos_variant_read--; } + /*newvar->ref[ref_pos++] = nucleotide[ref_genome->data[pos_variant_genome] & 3];*/ + ref_pos++; + // skip first value which should be the equivalent of first element in ref genome pos_variant_read++; while (pos_variant_read <= ps_var_read) { @@ -489,7 +493,6 @@ int get_read_update_positions( while (code_result_tab[code_result_index] < 4) { ps_var_genome++; code_result_index++; - ref_pos++; } while (ref_genome->data[ps_var_genome] == read[ps_var_read] && pos_variant_genome && ps_var_read) { @@ -513,11 +516,11 @@ int get_read_update_positions( /*assert(update_genome_position[pos_variant_read+1] == 0);*/ update_genome_position[pos_variant_read + 1] = ps_var_genome - pos_variant_genome; } - //while (pos_variant_genome <= ps_var_genome) { - // pos_variant_genome++; - // ref_pos++; - //} - //pos_variant_genome -= ref_pos; + 
while (pos_variant_genome <= ps_var_genome) { + pos_variant_genome++; + ref_pos++; + } + pos_variant_genome -= ref_pos; ++nbIndels; } else @@ -588,7 +591,7 @@ bool update_frequency_table( #ifdef USE_INDEL - static bool debug = true; + static bool debug = false; static char nucleotide[4] = { 'A', 'C', 'T', 'G' }; uint64_t update_genome_position[SIZE_READ]; uint32_t substCnt = 0; diff --git a/host/src/vartree.c b/host/src/vartree.c index 38528f7..4a62892 100644 --- a/host/src/vartree.c +++ b/host/src/vartree.c @@ -359,6 +359,7 @@ void create_vcf() /** * Dump debugging information: frequency table for a given set of positions **/ +#if 0 dbg_file = fopen("freq_debug.txt", "w"); sub_file = fopen("subst.txt", "r"); assert(sub_file); @@ -395,6 +396,7 @@ void create_vcf() } fclose(dbg_file); fclose(sub_file); +#endif /* for each sequence in the genome */ for (uint32_t seq_number = 0; seq_number < ref_genome->nb_seq; seq_number++) { From eb87cfec1ddb72c73de6987f6656b41f9818cd72 Mon Sep 17 00:00:00 2001 From: amoisson Date: Tue, 23 Nov 2021 15:04:19 +0100 Subject: [PATCH 18/48] created a debugging and logging framework for read mappings --- host/inc/debug.h | 72 +++++++++++++++ host/inc/processread.h | 6 ++ host/inc/sam.h | 13 +++ host/src/processread.c | 33 +++++-- host/src/sam.c | 193 +++++++++++++++++++++++++++++++++++++++++ host/src/upvc.c | 1 + host/src/vartree.c | 2 +- 7 files changed, 311 insertions(+), 9 deletions(-) create mode 100644 host/inc/debug.h create mode 100644 host/inc/sam.h create mode 100644 host/src/sam.c diff --git a/host/inc/debug.h b/host/inc/debug.h new file mode 100644 index 0000000..7c3dfea --- /dev/null +++ b/host/inc/debug.h @@ -0,0 +1,72 @@ +#define V_QUIET 0 +#define V_FATAL 1 +#define V_ERROR 2 +#define V_WARN 3 +#define V_INFO 4 +#define V_DEBUG 5 +#define V_TRACE 6 + +#define VERBOSE V_TRACE +#define VERBOSE_COLORS true +#define VERBOSE_LOG_LEVEL true + +#if VERBOSE_COLORS +#define VERBOSE_COLOR_START_FATAL "\033[41m" +#define 
VERBOSE_COLOR_START_ERROR "\033[31m" +#define VERBOSE_COLOR_START_WARN "\033[33m" +#define VERBOSE_COLOR_START_INFO "\033[32m" +#define VERBOSE_COLOR_START_DEBUG "\033[34m" +#define VERBOSE_COLOR_START_TRACE "\033[36m" +#define VERBOSE_COLOR_END "\033[0m" +#else +#define VERBOSE_COLOR_START_FATAL +#define VERBOSE_COLOR_START_ERROR +#define VERBOSE_COLOR_START_WARN +#define VERBOSE_COLOR_START_INFO +#define VERBOSE_COLOR_START_DEBUG +#define VERBOSE_COLOR_START_TRACE +#define VERBOSE_COLOR_END +#endif + +#if VERBOSE_LOG_LEVEL +#define VERBOSE_PRINT_PREFIX(level) VERBOSE_COLOR_START_##level #level "\t" VERBOSE_COLOR_END +#else +#define VERBOSE_PRINT_PREFIX(level) +#endif + + +#if VERBOSE>=V_TRACE +#define LOG_TRACE(...) fprintf(stderr, VERBOSE_PRINT_PREFIX(TRACE) __VA_ARGS__) +#else +#define LOG_TRACE(...) +#endif + +#if VERBOSE>=V_DEBUG +#define LOG_DEBUG(...) fprintf(stderr, VERBOSE_PRINT_PREFIX(DEBUG) __VA_ARGS__) +#else +#define LOG_DEBUG(...) +#endif + +#if VERBOSE>=V_INFO +#define LOG_INFO(...) fprintf(stderr, VERBOSE_PRINT_PREFIX(INFO) __VA_ARGS__) +#else +#define LOG_INFO(...) +#endif + +#if VERBOSE>=V_WARN +#define LOG_WARN(...) fprintf(stderr, VERBOSE_PRINT_PREFIX(WARN) __VA_ARGS__) +#else +#define LOG_WARN(...) +#endif + +#if VERBOSE>=V_ERROR +#define LOG_ERROR(...) fprintf(stderr, VERBOSE_PRINT_PREFIX(ERROR) __VA_ARGS__) +#else +#define LOG_ERROR(...) +#endif + +#if VERBOSE>=V_FATAL +#define LOG_FATAL(...) fprintf(stderr, VERBOSE_PRINT_PREFIX(FATAL) __VA_ARGS__) +#else +#define LOG_FATAL(...) 
+#endif diff --git a/host/inc/processread.h b/host/inc/processread.h index acb3cb3..09ad7d8 100644 --- a/host/inc/processread.h +++ b/host/inc/processread.h @@ -7,6 +7,12 @@ #include +#define CODE_SUB 10 +#define CODE_DEL 11 +#define CODE_INS 12 +#define CODE_END 13 +#define CODE_ERR 14 + void process_read(FILE *fpe1, FILE *fpe2, int round, unsigned int pass_id); void process_read_init(); diff --git a/host/inc/sam.h b/host/inc/sam.h new file mode 100644 index 0000000..d70cad6 --- /dev/null +++ b/host/inc/sam.h @@ -0,0 +1,13 @@ +/** + * Copyright 2021 - A Moisson-Franckhauser & UPMEM + */ +#ifndef __SAM_H__ +#define __SAM_H__ + +void open_sam_file(); +//TODO : either reuse this code or delete it +//void write_sam_read(uint64_t genome_pos, uint8_t *code, int8_t *read); +void write_read_mapping(uint64_t genome_pos, uint8_t *code); +void close_sam_file(); + +#endif /* __SAM_H__ */ diff --git a/host/src/processread.c b/host/src/processread.c index 4bed7b2..7d152be 100644 --- a/host/src/processread.c +++ b/host/src/processread.c @@ -11,21 +11,19 @@ #include "accumulateread.h" #include "common.h" +#include "debug.h" #include "genome.h" #include "getread.h" #include "processread.h" +#include "sam.h" #include "upvc.h" #include "vartree.h" +#define DEBUG_READ_MAPPING true + #define SIZE_INSERT_MEAN (400) #define SIZE_INSERT_STD (3 * 50) -#define CODE_SUB 10 -#define CODE_DEL 11 -#define CODE_INS 12 -#define CODE_END 13 -#define CODE_ERR 14 - #define CODE_A 0 /* ('A'>>1)&3 41H 0100 0001 */ #define CODE_C 1 /* ('C'>>1)&3 43H 0100 0011 */ #define CODE_T 2 /* ('T'>>1)&3 54H 0101 0100 */ @@ -246,25 +244,29 @@ static int code_alignment(uint8_t *code, int score, int8_t *gen, int8_t *read, u backtrack_idx--; } else { if (backtrak[backtrack_idx].type == CODE_DEL) { - int backtrack_jx = backtrak[backtrack_idx].jx; + //int backtrack_jx = backtrak[backtrack_idx].jx; code[code_idx++] = CODE_DEL; code[code_idx++] = backtrak[backtrack_idx].ix; code[code_idx++] = 
gen[backtrak[backtrack_idx].ix] & 3; backtrack_idx--; + /* while ((backtrak[backtrack_idx].type == CODE_DEL) && (backtrack_jx == backtrak[backtrack_idx].jx)) { code[code_idx++] = gen[backtrak[backtrack_idx].ix] & 3; backtrack_idx--; } + */ } else { - int backtrack_ix = backtrak[backtrack_idx].ix; + //int backtrack_ix = backtrak[backtrack_idx].ix; code[code_idx++] = CODE_INS; code[code_idx++] = backtrak[backtrack_idx].jx - 1; code[code_idx++] = read[backtrak[backtrack_idx].jx]; backtrack_idx--; + /* while ((backtrak[backtrack_idx].type == CODE_INS) && (backtrack_ix == backtrak[backtrack_idx].ix)) { code[code_idx++] = read[backtrak[backtrack_idx].jx]; backtrack_idx--; } + */ } } } @@ -283,6 +285,7 @@ static void set_variant( char nucleotide[4] = { 'A', 'C', 'T', 'G' }; uint64_t genome_pos = ref_genome->pt_seq[result_match.coord.seq_nr] + result_match.coord.seed_nr; int size_read = SIZE_READ; + //LOG_TRACE("set_variant called\n"); /* Get the differences betweend the read and the sequence of the reference genome that match */ read = &reads_buffer[result_match.num * size_read]; @@ -295,9 +298,15 @@ static void set_variant( ref_genome->mapping_coverage[genome_pos + i] += 1; } +#if DEBUG_READ_MAPPING + // TODO: check genome_pos is the expected value + write_read_mapping(genome_pos, code_result_tab); +#endif + code_result_idx = 0; while (code_result_tab[code_result_idx] != CODE_END) { int code_result = code_result_tab[code_result_idx]; + //LOG_DEBUG("code_result=%d\n", code_result); int64_t pos_variant_read = code_result_tab[code_result_idx + 1]; int64_t pos_variant_genome = genome_pos + pos_variant_read; int ref_pos = 0; @@ -364,6 +373,7 @@ static void set_variant( newvar->ref[ref_pos++] = nucleotide[ref_genome->data[pos_variant_genome] & 3]; if (ref_pos >= MAX_SIZE_ALLELE - 1) { free(newvar); + //LOG_TRACE("set_variant early return\n"); return; } pos_variant_genome++; @@ -375,6 +385,7 @@ static void set_variant( variant_tree_insert( newvar, result_match.coord.seq_nr, 
pos_variant_genome + 1 - ref_genome->pt_seq[result_match.coord.seq_nr]); } + //LOG_TRACE("set_variant return\n"); } #endif @@ -1046,6 +1057,9 @@ static void *process_read_thread_fct(void *arg) void process_read_init() { +#if DEBUG_READ_MAPPING + open_sam_file(); +#endif genome_t *ref_genome = genome_get(); args.ref_genome = ref_genome; @@ -1061,6 +1075,9 @@ void process_read_init() void process_read_free() { +#if DEBUG_READ_MAPPING + close_sam_file(); +#endif stop_threads = true; pthread_barrier_wait(&barrier); diff --git a/host/src/sam.c b/host/src/sam.c new file mode 100644 index 0000000..d11a370 --- /dev/null +++ b/host/src/sam.c @@ -0,0 +1,193 @@ +/** + * Copyright 2021 - A. Moisson-Franckhauser & UPMEM + */ + +#include +#include + +#include "index.h" +#include "sam.h" +#include "genome.h" +#include "common.h" +#include "parse_args.h" +#include "processread.h" +#include "debug.h" + +#define SAM_FILENAME "read_alignments.sam" +#define SAM_VERSION "1.6" +#define PROGRAM_NAME "upvc" +#define MAX_CIGAR_LENGTH (3*SIZE_READ) +#define MAX_PATCH_LENGTH (3*SIZE_READ) + +#define CIGAR_MATCH '=' +#define CIGAR_MISMATCH 'X' +#define CIGAR_INSERT 'I' +#define CIGAR_DELETE 'D' + +FILE *sam_file; + +static char *get_sam_filename() +{ + static char filename[FILENAME_MAX]; + sprintf(filename, "%s", SAM_FILENAME); + LOG_TRACE("sam filename : \"%s\"\n", filename); + return filename; +} + +void open_sam_file() +{ + LOG_DEBUG("opening sam file\n"); + // TODO: check for memory leaks here + char *filename = get_sam_filename(); + sam_file = fopen(filename, "w"); + if (sam_file == NULL) + { + LOG_FATAL("couldn't open sam file; errno : %u\n", errno); + } + LOG_DEBUG("openned sam file : %p\n", sam_file); + // TODO: complete header + LOG_TRACE("writing sam header\n"); + LOG_DEBUG("written test line in sam file\n"); + fprintf(sam_file, "@HD VN:" SAM_VERSION " SO:unknown\n"); + fprintf(sam_file, "@PG ID:1 PN:" PROGRAM_NAME "\n"); + LOG_DEBUG("sam header written\n"); +} + +/* + * TODO: 
Either reuse this code or delete it +void write_sam_read(uint64_t genome_pos, uint8_t *code, int8_t *read) +{ + const unsigned int flag = 0x0; + const uint8_t mapping_quality = 255;// unknown quality; TODO: set quality + + char cigar[MAX_CIGAR_LENGTH]; + uint32_t cigar_idx = 0; + char last_code = 0; + unsigned int code_count = 0; + int last_position = -1; + int tlen = 0;//TODO: figure out what tlen is supposed to be + + char sequence[SIZE_READ+1]; + char nucleotide[4] = {'A', 'C', 'T', 'G'}; + + for (uint32_t code_idx=0; code[code_idx] != CODE_END;) + { + char new_code=0; + int new_position=last_position+1; + switch (code[code_idx++]) + { + case CODE_SUB: + new_code = CIGAR_MISMATCH; + new_position = code[code_idx++]; + code_idx++; + break; + case CODE_INS: + new_code = CIGAR_INSERT; + new_position = code[code_idx++]; + code_idx++; + break; + case CODE_DEL: + new_code = CIGAR_DELETE; + new_position = code[code_idx++]; + code_idx++; + break; + } + if (last_code != new_code) + { + if (code_count>0) + { + cigar_idx += sprintf(cigar+cigar_idx, "%u%c", code_count, last_code); + } + last_code = new_code; + code_count = 0; + } + if (new_position > last_position+1) + { + if (code_count > 0) + { + cigar_idx += sprintf(cigar+cigar_idx, "%u%c", code_count, last_code); + code_count = 0; + } + cigar_idx += sprintf(cigar+cigar_idx, "%u%c", new_position-last_position-1, CIGAR_MATCH); + } + code_count++; + last_position = new_position; + } + if (code_count > 0) + { + cigar_idx += sprintf(cigar+cigar_idx, "%u%c", code_count, last_code); + } + if (SIZE_READ > last_position+1) + { + cigar_idx += sprintf(cigar+cigar_idx, "%u%c", SIZE_READ-last_position-1, CIGAR_MATCH); + } + cigar[cigar_idx] = 0;//ensure last string character is 0 + LOG_TRACE("cigar : \"%s\"\n", cigar); + + for (uint32_t i=0; i #include "accumulateread.h" +#include "debug.h" #include "dispatch.h" #include "dpu_backend.h" #include "genome.h" diff --git a/host/src/vartree.c b/host/src/vartree.c index 4a62892..1197b85 
100644 --- a/host/src/vartree.c +++ b/host/src/vartree.c @@ -105,7 +105,7 @@ depth_filter_t indel_filter[] = { [10] = { 1, 30 }, [11] = { 1, 40 }, }; -#elif (SIZE_READ == 150) +#elif (SIZE_READ == 150) || (SIZE_READ==148) depth_filter_t sub_filter[] = { [3] = { 15, 16 }, [4] = { 17, 20 }, From 417aea7e0714cd6d8481a2e9a36528281096b097 Mon Sep 17 00:00:00 2001 From: amoisson Date: Tue, 7 Dec 2021 15:07:11 +0100 Subject: [PATCH 19/48] improved read mapping debugging framework and used better constants for mapping filters --- common/inc/common.h | 1 - dpu/compile_commands.json | 1 + dpu/src/task.c | 6 +- host/inc/mapping_file.h | 13 ++ host/inc/sam.h | 13 -- host/src/{sam.c => mapping_file.c} | 92 +++++++++----- host/src/processread.c | 48 ++++---- tests/igvlike-focus.py | 188 +++++++++++++++++++++++++++++ 8 files changed, 294 insertions(+), 68 deletions(-) create mode 120000 dpu/compile_commands.json create mode 100644 host/inc/mapping_file.h delete mode 100644 host/inc/sam.h rename host/src/{sam.c => mapping_file.c} (61%) create mode 100755 tests/igvlike-focus.py diff --git a/common/inc/common.h b/common/inc/common.h index 8b4f42a..0e66dc1 100644 --- a/common/inc/common.h +++ b/common/inc/common.h @@ -19,7 +19,6 @@ #define MAX_RESULTS_PER_READ (1 << 10) #define SIZE_READ 148 -//#define SIZE_READ 120 #define SIZE_SEED 14 #define SIZE_NEIGHBOUR_IN_BYTES ((SIZE_READ - SIZE_SEED) / 4) #define DELTA_NEIGHBOUR(round) ((SIZE_SEED * round) / 4) diff --git a/dpu/compile_commands.json b/dpu/compile_commands.json new file mode 120000 index 0000000..3f0d8f8 --- /dev/null +++ b/dpu/compile_commands.json @@ -0,0 +1 @@ +/home/upmemstaff/amoisson/Work/usecase_upvc/build/dpu/compile_commands.json \ No newline at end of file diff --git a/dpu/src/task.c b/dpu/src/task.c index ffd96a6..5b0ab5e 100644 --- a/dpu/src/task.c +++ b/dpu/src/task.c @@ -55,7 +55,11 @@ __host dpu_compute_time_t DPU_COMPUTE_TIME_VAR; /** * @brief Maximum score allowed. 
*/ -#define MAX_SCORE (40) +#if SIZE_READ>120 +#define MAX_SCORE 340 +#else +#define MAX_SCORE 225 +#endif /** * @brief Number of reference read to be fetch per mram read diff --git a/host/inc/mapping_file.h b/host/inc/mapping_file.h new file mode 100644 index 0000000..a3f1679 --- /dev/null +++ b/host/inc/mapping_file.h @@ -0,0 +1,13 @@ +/** + * Copyright 2021 - A Moisson-Franckhauser & UPMEM + */ +#ifndef __SAM_H__ +#define __SAM_H__ + +void open_mapping_file(); +//TODO : either reuse this code or delete it +//void write_mapping_read(uint64_t genome_pos, uint8_t *code, int8_t *read); +void write_read_mapping(char *chromosome_name, uint64_t genome_pos, uint8_t *code); +void close_mapping_file(); + +#endif /* __SAM_H__ */ diff --git a/host/inc/sam.h b/host/inc/sam.h deleted file mode 100644 index d70cad6..0000000 --- a/host/inc/sam.h +++ /dev/null @@ -1,13 +0,0 @@ -/** - * Copyright 2021 - A Moisson-Franckhauser & UPMEM - */ -#ifndef __SAM_H__ -#define __SAM_H__ - -void open_sam_file(); -//TODO : either reuse this code or delete it -//void write_sam_read(uint64_t genome_pos, uint8_t *code, int8_t *read); -void write_read_mapping(uint64_t genome_pos, uint8_t *code); -void close_sam_file(); - -#endif /* __SAM_H__ */ diff --git a/host/src/sam.c b/host/src/mapping_file.c similarity index 61% rename from host/src/sam.c rename to host/src/mapping_file.c index d11a370..b1ca9f3 100644 --- a/host/src/sam.c +++ b/host/src/mapping_file.c @@ -6,15 +6,15 @@ #include #include "index.h" -#include "sam.h" +#include "mapping_file.h" #include "genome.h" #include "common.h" #include "parse_args.h" #include "processread.h" #include "debug.h" -#define SAM_FILENAME "read_alignments.sam" -#define SAM_VERSION "1.6" +#define MAP_FILENAME "read_alignments.map" +#define MAP_VERSION "1.6" #define PROGRAM_NAME "upvc" #define MAX_CIGAR_LENGTH (3*SIZE_READ) #define MAX_PATCH_LENGTH (3*SIZE_READ) @@ -24,38 +24,37 @@ #define CIGAR_INSERT 'I' #define CIGAR_DELETE 'D' -FILE *sam_file; +FILE 
*mapping_file; -static char *get_sam_filename() +static char *get_mapping_filename() { static char filename[FILENAME_MAX]; - sprintf(filename, "%s", SAM_FILENAME); - LOG_TRACE("sam filename : \"%s\"\n", filename); + sprintf(filename, "%s", MAP_FILENAME); + LOG_TRACE("mapping filename : \"%s\"\n", filename); return filename; } -void open_sam_file() +void open_mapping_file() { - LOG_DEBUG("opening sam file\n"); + LOG_DEBUG("opening mapping file\n"); // TODO: check for memory leaks here - char *filename = get_sam_filename(); - sam_file = fopen(filename, "w"); - if (sam_file == NULL) + char *filename = get_mapping_filename(); + mapping_file = fopen(filename, "w"); + if (mapping_file == NULL) { - LOG_FATAL("couldn't open sam file; errno : %u\n", errno); + LOG_FATAL("couldn't open mapping file; errno : %u\n", errno); } - LOG_DEBUG("openned sam file : %p\n", sam_file); + LOG_DEBUG("openned mapping file : %p\n", mapping_file); // TODO: complete header - LOG_TRACE("writing sam header\n"); - LOG_DEBUG("written test line in sam file\n"); - fprintf(sam_file, "@HD VN:" SAM_VERSION " SO:unknown\n"); - fprintf(sam_file, "@PG ID:1 PN:" PROGRAM_NAME "\n"); - LOG_DEBUG("sam header written\n"); + LOG_TRACE("writing mapping header\n"); + //fprintf(mapping_file, "@HD VN:" MAP_VERSION " SO:unknown\n"); + //fprintf(mapping_file, "@PG ID:1 PN:" PROGRAM_NAME "\n"); + //LOG_DEBUG("mapping header written\n"); } /* * TODO: Either reuse this code or delete it -void write_sam_read(uint64_t genome_pos, uint8_t *code, int8_t *read) +void write_mapping_read(uint64_t genome_pos, uint8_t *code, int8_t *read) { const unsigned int flag = 0x0; const uint8_t mapping_quality = 255;// unknown quality; TODO: set quality @@ -131,22 +130,33 @@ void write_sam_read(uint64_t genome_pos, uint8_t *code, int8_t *read) sequence[SIZE_READ] = 0; //TODO: set read quality correctly (last string) - fprintf(sam_file, "*\t%u\t*\t%lu\t%u\t%s\t*\t0\t%d\t%s\t*\n", flag, genome_pos+1, mapping_quality, cigar, tlen, sequence); 
+ fprintf(mapping_file, "*\t%u\t*\t%lu\t%u\t%s\t*\t0\t%d\t%s\t*\n", flag, genome_pos+1, mapping_quality, cigar, tlen, sequence); } */ -void write_read_mapping(uint64_t genome_pos, uint8_t *code) { +void write_read_mapping(char *chromosome_name, uint64_t genome_pos, uint8_t *code) { char patch[MAX_PATCH_LENGTH]; int patch_idx=0; int nucleotides_read=0; uint32_t code_idx; char nucleotide[4]= {'A', 'C', 'T', 'G'}; + uint8_t last_action = CODE_INS; for (code_idx=0; code[code_idx] < CODE_END;) { uint8_t action = code[code_idx++]; - uint8_t position = code[code_idx++]; - uint8_t letter = code[code_idx++] && 0x3; + uint8_t position = nucleotides_read; + uint8_t letter='E'; + if (action>3) + { + position = code[code_idx++]; + if (code[code_idx] < 5) + { + letter = nucleotide[code[code_idx++]]; + } else { + letter = nucleotide[(code[code_idx++] && 0x6)>>1]; + } + } for (;nucleotides_read> 1]; + } + switch (last_action) + { + case CODE_SUB: + patch[patch_idx++] = letter|0x20;//lowercase + nucleotides_read++; + break; + case CODE_DEL: + patch[patch_idx++] = '/'; + break; + case CODE_INS: + patch[patch_idx++] = letter; + nucleotides_read++; + break; + } } } if (code[code_idx] == CODE_ERR) @@ -183,11 +215,11 @@ void write_read_mapping(uint64_t genome_pos, uint8_t *code) { patch[patch_idx++] = '='; } patch[patch_idx++] = 0&&code; - fprintf(sam_file, "%lu\t%s\n", genome_pos, patch); + fprintf(mapping_file, "%s\t%lu\t%s\n", chromosome_name, genome_pos, patch); } -void close_sam_file() +void close_mapping_file() { - LOG_TRACE("closing sam file\n"); - fclose(sam_file); + LOG_TRACE("closing mapping file\n"); + fclose(mapping_file); } diff --git a/host/src/processread.c b/host/src/processread.c index 7d152be..ed81e0b 100644 --- a/host/src/processread.c +++ b/host/src/processread.c @@ -15,7 +15,7 @@ #include "genome.h" #include "getread.h" #include "processread.h" -#include "sam.h" +#include "mapping_file.h" #include "upvc.h" #include "vartree.h" @@ -31,7 +31,11 @@ #define PQD_INIT_VAL 
(99) -#define MAX_SUBSTITUTION (4) +#if SIZE_READ>120 +#define MAX_SUBSTITUTION 31 +#else +#define MAX_SUBSTITUTION 20 +#endif static bool flag_dbg = false; @@ -244,29 +248,25 @@ static int code_alignment(uint8_t *code, int score, int8_t *gen, int8_t *read, u backtrack_idx--; } else { if (backtrak[backtrack_idx].type == CODE_DEL) { - //int backtrack_jx = backtrak[backtrack_idx].jx; + int backtrack_jx = backtrak[backtrack_idx].jx; code[code_idx++] = CODE_DEL; code[code_idx++] = backtrak[backtrack_idx].ix; code[code_idx++] = gen[backtrak[backtrack_idx].ix] & 3; backtrack_idx--; - /* while ((backtrak[backtrack_idx].type == CODE_DEL) && (backtrack_jx == backtrak[backtrack_idx].jx)) { code[code_idx++] = gen[backtrak[backtrack_idx].ix] & 3; backtrack_idx--; } - */ } else { - //int backtrack_ix = backtrak[backtrack_idx].ix; + int backtrack_ix = backtrak[backtrack_idx].ix; code[code_idx++] = CODE_INS; code[code_idx++] = backtrak[backtrack_idx].jx - 1; code[code_idx++] = read[backtrak[backtrack_idx].jx]; backtrack_idx--; - /* while ((backtrak[backtrack_idx].type == CODE_INS) && (backtrack_ix == backtrak[backtrack_idx].ix)) { code[code_idx++] = read[backtrak[backtrack_idx].jx]; backtrack_idx--; } - */ } } } @@ -429,11 +429,13 @@ int get_read_update_positions( __attribute__((unused))int size_neighbour_in_symbols, bool * flag, bool debug, - uint32_t * substCnt) { + uint32_t * substCnt, + char * chromosome_name) { // run smith and waterman algorithm to find indels uint8_t code_result_tab[256]; code_alignment(code_result_tab, result_tab[pos].score, &ref_genome->data[genome_pos], read, size_neighbour_in_symbols, flag); + write_read_mapping(chromosome_name, result_tab[pos].coord.seed_nr, code_result_tab); for(int read_pos = 0; read_pos < SIZE_READ; ++read_pos) { update_genome_position[read_pos] = 0; } @@ -486,7 +488,7 @@ int get_read_update_positions( while (pos_variant_read <= ps_var_read) { // position should not be updated yet if(update_genome_position[pos_variant_read] != 0) { 
- printf("Warning: duplicate update (Insertion) at position %lu. Current %lu\n", + LOG_WARN("duplicate update (Insertion) at position %lu. Current %lu\n", pos_variant_read, update_genome_position[pos_variant_read]); fflush(stdout); return -1; @@ -519,7 +521,7 @@ int get_read_update_positions( if(pos_variant_read + 1 < SIZE_READ) { // position should not be updated yet if(update_genome_position[pos_variant_read+1] != 0) { - printf("Warning: duplicate update (Deletion) at position %lu. Current %lu\n", + LOG_WARN("duplicate update (Deletion) at position %lu. Current %lu\n", pos_variant_read+1, update_genome_position[pos_variant_read+1]); fflush(stdout); return -1; @@ -613,7 +615,7 @@ bool update_frequency_table( pthread_mutex_lock(&freq_table_mutex); int nbIndels = get_read_update_positions(update_genome_position, result_tab, pos, - ref_genome, genome_pos, read, size_neighbour_in_symbols, &flag_dbg, debug, &substCnt); + ref_genome, genome_pos, read, size_neighbour_in_symbols, &flag_dbg, debug, &substCnt, ref_genome->seq_name[result_tab[pos].coord.seq_nr]); bool hasIndel = nbIndels > 0; // debug prints @@ -668,7 +670,7 @@ bool update_frequency_table( } else if(debug) { if(!result_tab[pos].coord.nodp) { - printf("\nWarning: odpd result with no indels detected (flag = %d, subst cnt %u)):\n", flag_dbg, substCnt); + LOG_WARN("odpd result with no indels detected (flag = %d, subst cnt %u)):\n", flag_dbg, substCnt); printf("Read:\n"); for(int k = 0; k < SIZE_READ; ++k) { printf("%c", nucleotide[read[k]]); @@ -693,7 +695,7 @@ bool update_frequency_table( frequency_table[read[k]][update_genome_pos].score++; } else if (update_genome_pos != UINT64_MAX) - printf("WARNING: genome update position computed is wrong %lu\n", update_genome_pos); + LOG_WARN("genome update position computed is wrong %lu\n", update_genome_pos); } } /*fflush(stdout);*/ @@ -708,7 +710,7 @@ bool update_frequency_table( frequency_table[read[j]][genome_pos+j].score++; } else - printf("WARNING: reads matched at 
position that exceeds genome size\n"); + LOG_WARN("reads matched at position that exceeds genome size\n"); } pthread_mutex_unlock(&freq_table_mutex); return false; @@ -867,7 +869,7 @@ static void do_process_read(process_read_arg_t *arg) int delta_corrected = MISMATCH_COUNT(result_tab[P1[0]]) + MISMATCH_COUNT(result_tab[P2[0]]) + MAPQ_SCALING_FACTOR * ((2 * (MAX_SUBSTITUTION + 1)) - delta); if(delta_corrected < 0) { - printf("WARNING: negative delta for square root %d\n", delta_corrected); + LOG_WARN("negative delta for square root %d\n", delta_corrected); } else if(delta > DIST_PAIR_THRESHOLD) { mapq = 1.0 - sqrt((double)delta_corrected / SIZE_READ); @@ -885,7 +887,7 @@ static void do_process_read(process_read_arg_t *arg) #ifdef USE_MAPQ_SCORE int delta_corrected = MISMATCH_COUNT(result_tab[P1[0]]) + MISMATCH_COUNT(result_tab[P2[0]]) + MAPQ_SCALING_FACTOR * ((2 * (MAX_SUBSTITUTION + 1)) - delta); if(delta_corrected < 0) { - printf("WARNING: negative delta (np == 1) for square root %d\n", delta_corrected); + LOG_WARN("negative delta (np == 1) for square root %d\n", delta_corrected); } else if(delta > DIST_PAIR_THRESHOLD) { mapq = 1.0 - sqrt((double)delta_corrected / SIZE_READ); @@ -927,7 +929,7 @@ static void do_process_read(process_read_arg_t *arg) int delta_corrected = MISMATCH_COUNT(result_tab[P1[0]]) + MAPQ_SCALING_FACTOR * ((MAX_SUBSTITUTION + 1) - delta); if(delta_corrected < 0) { - printf("WARNING: negative delta (np1 == 2) for square root %d\n", delta_corrected); + LOG_WARN("negative delta (np1 == 2) for square root %d\n", delta_corrected); } else if(delta > DIST_SINGLE_THRESHOLD) { mapq = 1.0 - sqrt((double)delta_corrected / SIZE_READ); @@ -946,7 +948,7 @@ static void do_process_read(process_read_arg_t *arg) int delta_corrected = MISMATCH_COUNT(result_tab[P1[0]]) + MAPQ_SCALING_FACTOR * ((MAX_SUBSTITUTION + 1) - delta); if(delta_corrected < 0) { - printf("WARNING: negative delta (np1 == 1) for square root %d\n", delta_corrected); + LOG_WARN("negative delta 
(np1 == 1) for square root %d\n", delta_corrected); } else if(delta > DIST_SINGLE_THRESHOLD) { mapq = 1.0 - sqrt((double)delta_corrected / SIZE_READ); @@ -967,7 +969,7 @@ static void do_process_read(process_read_arg_t *arg) int delta_corrected = MISMATCH_COUNT(result_tab[P2[0]]) + MAPQ_SCALING_FACTOR * ((MAX_SUBSTITUTION + 1) - delta); if(delta_corrected < 0) { - printf("WARNING: negative delta (np2 == 2) for square root %d, %d %d %d %d\n", + LOG_WARN("negative delta (np2 == 2) for square root %d, %d %d %d %d\n", delta_corrected, MISMATCH_COUNT(result_tab[P2[0]]), MISMATCH_COUNT(result_tab[P2[1]]), MAX_SUBSTITUTION + 1, delta); } else if(delta > DIST_SINGLE_THRESHOLD) { @@ -988,7 +990,7 @@ static void do_process_read(process_read_arg_t *arg) int delta_corrected = MISMATCH_COUNT(result_tab[P2[0]]) + MAPQ_SCALING_FACTOR * ((MAX_SUBSTITUTION + 1) - delta); if(delta_corrected < 0) { - printf("WARNING: negative delta (np2 == 1) for square root %d\n", delta_corrected); + LOG_WARN("negative delta (np2 == 1) for square root %d\n", delta_corrected); } else if (delta > DIST_SINGLE_THRESHOLD) { mapq = 1.0 - sqrt((double)delta_corrected / SIZE_READ); @@ -1058,7 +1060,7 @@ static void *process_read_thread_fct(void *arg) void process_read_init() { #if DEBUG_READ_MAPPING - open_sam_file(); + open_mapping_file(); #endif genome_t *ref_genome = genome_get(); args.ref_genome = ref_genome; @@ -1076,7 +1078,7 @@ void process_read_init() void process_read_free() { #if DEBUG_READ_MAPPING - close_sam_file(); + close_mapping_file(); #endif stop_threads = true; pthread_barrier_wait(&barrier); diff --git a/tests/igvlike-focus.py b/tests/igvlike-focus.py new file mode 100755 index 0000000..0e861f3 --- /dev/null +++ b/tests/igvlike-focus.py @@ -0,0 +1,188 @@ +#! 
/usr/bin/python3 +import argparse +import itertools +import mmap +import sys + +verbose = False + +def main(arguments): + global verbose + if arguments.verbosity: + verbose = len(arguments.verbosity) + chromosome = "chr"+str(arguments.chr) + start = arguments.index-arguments.context + end = arguments.index+arguments.context+1 + if arguments.before != None: + start = arguments.index-arguments.before + if arguments.after != None: + end = arguments.index+arguments.after + fasta_file_name = None + mapping_file_names = [] + for f in arguments.files: + extension = f.split(".")[-1] + if extension == "fasta": + if fasta_file_name == None: + fasta_file_name = f + else: + print("Can't handle more than one fasta file") + return 1 + elif extension == "map": + mapping_file_names.append(f) + else: + print("unknown file extension :", extension) + print("for :", f) + print("please only use fasta and map files"); + + insertions = [0]*(end-start) + block = [] + + if fasta_file_name != None: + block.append(list(zip(range(start, end), get_genome_part(fasta_file_name, chromosome, start, end, arguments.index)))) + + if verbose: + print("parsing intersecting mappings") + for mapping_start_address, mapping_string in get_intersecting_mappings(mapping_file_names, chromosome, start, end, arguments.max_read_size): + if verbose>1: + print("found mapping at :", mapping_start_address) + this_line_insertions = [0]*(end-start+60) + block.append([]) + address = max(start-1, mapping_start_address-1) + i = address-mapping_start_address+1 + while address": + current_chr = line[1:].strip() + continue + endline_address = address+line_length + if current_chr == chromosome and endline_address>start: + for i in range(max(start-address, 0), min(end-address, line_length)): + if address+i == index-1: + yield line[i].upper() + else: + yield line[i].lower() + if endline_address>end: + return + address = endline_address + +def get_intersecting_mappings(file_names, chromosome, start, end, max_read_size): + for 
file_name in file_names: + with get_index(file_name) as index_file: + with open(file_name, mode="r", encoding="utf-8") as map_file: + with mmap.mmap(map_file.fileno(), length=0, access=mmap.ACCESS_READ) as mmap_map_file: + last_address=0 + last_chr = None + current_chr = None + address = 0 + for line in index_file.readlines(): + last_address = address + last_chr = current_chr + current_chr, index, address = line.split() + index = int(index) + address = int(address) + if current_chr == chromosome and index >= start: + break + mmap_map_file.seek(int(last_address)) + yielded_line = False + for line in mmap_readlines(mmap_map_file): + if line == b'': + continue + current_chr = line.split()[0].decode() + index = int(line.split()[1]) + if not(yielded_line) and index+max_read_sizeend-1: + break + if current_chr!=chromosome and yielded_line: + break + +def mmap_readlines(mmap_obj): + while True: + try: + yield mmap_obj.readline() + except: + return + +def get_index(file_name): + try: + return open(file_name+".index", mode="r") + except IOError: + create_index(file_name) + return open(file_name+".index", mode="r") + +def create_index(file_name): + print("creating index for", file_name) + with open(file_name+".index", mode="w") as index_file: + with open(file_name, mode="r", encoding="utf-8") as map_file: + with mmap.mmap(map_file.fileno(), length=0, access=mmap.ACCESS_READ) as mmap_map_file: + if verbose: + print("file openned for indexing") + for i in itertools.count(): + if verbose>1: + print("indexing :", i*10000000) + try: + mmap_map_file.seek(i*10000000) + except ValueError: + return + if i>0: + mmap_map_file.readline() + line_address = mmap_map_file.tell() + line = mmap_map_file.readline().split() + chromosome = line[0] + index = line[1] + index_file.write(chromosome.decode()+"\t"+index.decode()+"\t"+str(line_address)+"\n") + + +if __name__=="__main__": + parser = argparse.ArgumentParser(description="show read mappings around a specific part of the genome.") + 
parser.add_argument("files", nargs="+", type=str, help="Fasta and map files to read from") + parser.add_argument("-c", "--chr", type=int, dest="chr", help="The chromosome in which to show the read mappings") + parser.add_argument("-i", "--index", type=int, dest="index", help="The address around which to show read mappings") + parser.add_argument("-A", "--after", default=None, type=int, dest='after', help="how many bases to show after address") + parser.add_argument("-B", "--before", default=None, type=int, dest='before', help="how many bases to show before address") + parser.add_argument("-C", "--context", default=40, type=int, dest='context', help="how many bases to show before and after address") + parser.add_argument("-s", "--max-read-size", default=150, type=int, dest="max_read_size", help="the maximum length of a read; preferably exact") + parser.add_argument("-v", action="append_const", dest="verbosity", const=True, help="add verbosity; flag may be set multiple times for more verbosity") + parser.add_argument("-p", "--progressive", action='store_true', dest="progressive", help="print results everytime a new read is found") + main(parser.parse_args()) From 988d57e4e7fdcd5a47dadcbacdd6490c87842e96 Mon Sep 17 00:00:00 2001 From: amoisson Date: Wed, 8 Dec 2021 16:26:24 +0100 Subject: [PATCH 20/48] replaced '=' with nucleotide in read mapping file --- host/inc/mapping_file.h | 2 +- host/src/mapping_file.c | 34 +++++++++++++++++++++++----------- host/src/processread.c | 2 +- 3 files changed, 25 insertions(+), 13 deletions(-) diff --git a/host/inc/mapping_file.h b/host/inc/mapping_file.h index a3f1679..83e2e7f 100644 --- a/host/inc/mapping_file.h +++ b/host/inc/mapping_file.h @@ -7,7 +7,7 @@ void open_mapping_file(); //TODO : either reuse this code or delete it //void write_mapping_read(uint64_t genome_pos, uint8_t *code, int8_t *read); -void write_read_mapping(char *chromosome_name, uint64_t genome_pos, uint8_t *code); +void write_read_mapping(char 
*chromosome_name, uint64_t genome_pos, uint8_t *code, uint8_t *read); void close_mapping_file(); #endif /* __SAM_H__ */ diff --git a/host/src/mapping_file.c b/host/src/mapping_file.c index b1ca9f3..a626f85 100644 --- a/host/src/mapping_file.c +++ b/host/src/mapping_file.c @@ -134,18 +134,18 @@ void write_mapping_read(uint64_t genome_pos, uint8_t *code, int8_t *read) } */ -void write_read_mapping(char *chromosome_name, uint64_t genome_pos, uint8_t *code) { +void write_read_mapping(char *chromosome_name, uint64_t genome_pos, uint8_t *code, uint8_t *read) { char patch[MAX_PATCH_LENGTH]; int patch_idx=0; + int read_idx=0; - int nucleotides_read=0; uint32_t code_idx; char nucleotide[4]= {'A', 'C', 'T', 'G'}; uint8_t last_action = CODE_INS; for (code_idx=0; code[code_idx] < CODE_END;) { uint8_t action = code[code_idx++]; - uint8_t position = nucleotides_read; + uint8_t position = read_idx; uint8_t letter='E'; if (action>3) { @@ -157,22 +157,28 @@ void write_read_mapping(char *chromosome_name, uint64_t genome_pos, uint8_t *cod letter = nucleotide[(code[code_idx++] && 0x6)>>1]; } } - for (;nucleotides_read>1]; + } + // patch[patch_idx++] = '='; } switch (action) { case CODE_SUB: patch[patch_idx++] = letter|0x20;//lowercase - nucleotides_read++; + read_idx++; break; case CODE_DEL: patch[patch_idx++] = '/'; break; case CODE_INS: patch[patch_idx++] = letter; - nucleotides_read++; + read_idx++; break; default: //Consider the code read is not an action but a letter associated with the previous action @@ -186,14 +192,14 @@ void write_read_mapping(char *chromosome_name, uint64_t genome_pos, uint8_t *cod { case CODE_SUB: patch[patch_idx++] = letter|0x20;//lowercase - nucleotides_read++; + read_idx++; break; case CODE_DEL: patch[patch_idx++] = '/'; break; case CODE_INS: patch[patch_idx++] = letter; - nucleotides_read++; + read_idx++; break; } } @@ -210,9 +216,15 @@ void write_read_mapping(char *chromosome_name, uint64_t genome_pos, uint8_t *cod LOG_TRACE("code[%u]=%u\n", i, 
code[i]); } } - for (;nucleotides_read>1]; + } + // patch[patch_idx++] = '='; } patch[patch_idx++] = 0&&code; fprintf(mapping_file, "%s\t%lu\t%s\n", chromosome_name, genome_pos, patch); diff --git a/host/src/processread.c b/host/src/processread.c index ed81e0b..a7303fd 100644 --- a/host/src/processread.c +++ b/host/src/processread.c @@ -435,7 +435,7 @@ int get_read_update_positions( // run smith and waterman algorithm to find indels uint8_t code_result_tab[256]; code_alignment(code_result_tab, result_tab[pos].score, &ref_genome->data[genome_pos], read, size_neighbour_in_symbols, flag); - write_read_mapping(chromosome_name, result_tab[pos].coord.seed_nr, code_result_tab); + write_read_mapping(chromosome_name, result_tab[pos].coord.seed_nr, code_result_tab, (uint8_t*) read); for(int read_pos = 0; read_pos < SIZE_READ; ++read_pos) { update_genome_position[read_pos] = 0; } From 69495341d4dede77a37171e3f7a1ca683dd729c6 Mon Sep 17 00:00:00 2001 From: amoisson Date: Wed, 8 Dec 2021 16:49:53 +0100 Subject: [PATCH 21/48] fixed case issue in mapping file --- host/src/mapping_file.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/host/src/mapping_file.c b/host/src/mapping_file.c index a626f85..eaa13d5 100644 --- a/host/src/mapping_file.c +++ b/host/src/mapping_file.c @@ -161,9 +161,9 @@ void write_read_mapping(char *chromosome_name, uint64_t genome_pos, uint8_t *cod { if (read[read_idx]<5) { - patch[patch_idx++] = nucleotide[read[read_idx++]]; + patch[patch_idx++] = nucleotide[read[read_idx++]]|0x20;//lowercase } else { - patch[patch_idx++] = nucleotide[(read[read_idx++] && 0x6)>>1]; + patch[patch_idx++] = nucleotide[(read[read_idx++] && 0x6)>>1]|0x20;//lowercase } // patch[patch_idx++] = '='; } @@ -220,9 +220,9 @@ void write_read_mapping(char *chromosome_name, uint64_t genome_pos, uint8_t *cod { if (read[read_idx]<5) { - patch[patch_idx++] = nucleotide[read[read_idx++]]; + patch[patch_idx++] = nucleotide[read[read_idx++]]|0x20;//lowercase } else { - 
patch[patch_idx++] = nucleotide[(read[read_idx++] && 0x6)>>1]; + patch[patch_idx++] = nucleotide[(read[read_idx++] && 0x6)>>1]|0x20;//lowercase } // patch[patch_idx++] = '='; } From 98023778db1e30348aaf3777e287535610f8dbb0 Mon Sep 17 00:00:00 2001 From: amoisson Date: Fri, 10 Dec 2021 13:56:20 +0100 Subject: [PATCH 22/48] made host-side filters more permissive --- common/inc/common.h | 2 +- dpu/src/task.c | 4 ++-- host/src/processread.c | 29 +++++++++++++++++++++-------- host/src/vartree.c | 10 ++++++++++ 4 files changed, 34 insertions(+), 11 deletions(-) diff --git a/common/inc/common.h b/common/inc/common.h index 0e66dc1..e1a1cbd 100644 --- a/common/inc/common.h +++ b/common/inc/common.h @@ -134,6 +134,6 @@ typedef struct { #define MAPQ_SCALING_FACTOR 2 #define READ_DIST_LOWER_BOUND 50 #define READ_DIST_UPPER_BOUND 2000 -#define depth_filter depth_filter3 +#define depth_filter depth_filter_permissive #endif /* __COMMON_H__ */ diff --git a/dpu/src/task.c b/dpu/src/task.c index 5b0ab5e..f349f4c 100644 --- a/dpu/src/task.c +++ b/dpu/src/task.c @@ -56,9 +56,9 @@ __host dpu_compute_time_t DPU_COMPUTE_TIME_VAR; * @brief Maximum score allowed. 
*/ #if SIZE_READ>120 -#define MAX_SCORE 340 +#define MAX_SCORE 294 #else -#define MAX_SCORE 225 +#define MAX_SCORE 144 #endif /** diff --git a/host/src/processread.c b/host/src/processread.c index a7303fd..d61952d 100644 --- a/host/src/processread.c +++ b/host/src/processread.c @@ -37,6 +37,8 @@ #define MAX_SUBSTITUTION 20 #endif +#define MAX_SCORE_DIFFERENCE_WITH_BEST 40 + static bool flag_dbg = false; typedef struct { @@ -826,6 +828,14 @@ static void do_process_read(process_read_arg_t *arg) } release_curr_match(j); + unsigned int best_individual_scores[4] = {UINT32_MAX}; + for (unsigned int x=i; x result_tab[x].score) + best_individual_scores[t] = result_tab[x].score; + } + // i = start index in result_tab // j = stop index in result_tab // select best couples of paired reads @@ -838,14 +848,17 @@ static void do_process_read(process_read_arg_t *arg) for (unsigned int x1 = i; x1 < j; x1++) { t1 = result_tab[x1].num % 4; pos1 = result_tab[x1].coord.seed_nr; - for (unsigned int x2 = i + 1; x2 < j; x2++) { - pos2 = result_tab[x2].coord.seed_nr; - t2 = result_tab[x2].num % 4; - if (t1 + t2 == 3) // select significant pair - { - if ((abs((int)pos2 - (int)pos1) > READ_DIST_LOWER_BOUND && (abs((int)pos2 - (int)pos1) < READ_DIST_UPPER_BOUND))) { - // update if this is one of the two best scores - keep_best_2_scores(result_tab[x1].score + result_tab[x2].score, P1, P2, x1, x2, best_score); + if (result_tab[x1].score < best_individual_scores[t1] + MAX_SCORE_DIFFERENCE_WITH_BEST) + { + for (unsigned int x2 = i + 1; x2 < j; x2++) { + pos2 = result_tab[x2].coord.seed_nr; + t2 = result_tab[x2].num % 4; + if (t1 + t2 == 3) // select significant pair + { + if ((abs((int)pos2 - (int)pos1) > READ_DIST_LOWER_BOUND && (abs((int)pos2 - (int)pos1) < READ_DIST_UPPER_BOUND))) { + // update if this is one of the two best scores + keep_best_2_scores(result_tab[x1].score + result_tab[x2].score, P1, P2, x1, x2, best_score); + } } } } diff --git a/host/src/vartree.c b/host/src/vartree.c index 
1197b85..51906c0 100644 --- a/host/src/vartree.c +++ b/host/src/vartree.c @@ -260,6 +260,16 @@ __attribute__((unused)) uint32_t depth_filter_a(float freq) { return UINT_MAX; } +__attribute__((unused)) uint32_t depth_filter_permissive(float freq) { + if (freq< 10.0f) { + return 3; + } + if (freq<20.0f) { + return 2; + } + return 1; +} + __attribute__((unused)) uint32_t depth_filter_fixed_3(float freq) { if(freq < 20.0f) From f5dc5d62eb2a0b190b75280ccbd86a8580fafb05 Mon Sep 17 00:00:00 2001 From: amoisson Date: Fri, 10 Dec 2021 14:51:36 +0100 Subject: [PATCH 23/48] added a whole bunch of comments in DPD and DPD_compute functions --- host/src/processread.c | 124 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 123 insertions(+), 1 deletion(-) diff --git a/host/src/processread.c b/host/src/processread.c index d61952d..6f6512a 100644 --- a/host/src/processread.c +++ b/host/src/processread.c @@ -52,13 +52,33 @@ static int min(int a, int b) { return a < b ? a : b; } static void DPD_compute( int s1, int s2, int *Dij, int Dijm, int Dimj, int Dimjm, int *Pij, int Pijm, int *Qij, int Qimj, int *xij) { + /* Compute values for D,P,Q and x at (i,j) from values at (i-1,j), (i,j-1), (i-1,j-1). + * i represents relative index in reference genome (starting from where the read is mapped). + * j represents index in read. + * Dij is the minimum cost to align the first i nucleotides from genome with the first j nucleotides from read. + * Pij is the same minimum cost but assuming the last operation was an insertion. + * Qij is the same minimum cost but assuming the last operation was a deletion. 
+ * xij is the chosen operation to reach the cost in Dij : + * 0 means read and reference genome match + * 1 is for a substitution + * 2 is for an insertion + * 3 is for a deletion + */ int min_QP, d; + // Compute cost in case of insertion : + // D_i_j-1 + COST_GAPO in case of first insertion + // D_i_j-1 + COST_GAPE in case insertion directly follows another insertion (In this case, D_i_j-1 = P_i_j-1) *Pij = min(Dijm + COST_GAPO, Pijm + COST_GAPE); + // Similar for deletion : *Qij = min(Dimj + COST_GAPO, Qimj + COST_GAPE); + + //x_i_j is the backtracking *xij = 0; int x; + // Get minimum between Pij (which assumes insertion) and Qij (which assumes deletion) + // and store the associated operation in x (2 for insertion and 3 for deletion). if (*Pij < *Qij) { min_QP = *Pij; x = 2; @@ -66,11 +86,18 @@ static void DPD_compute( min_QP = *Qij; x = 3; } + + // Compute the diagonal score in d : + // Dimjm (D[i-1][j-1]) if genome and read match (genome[read_map_address+SIZE_SEED+i]==read[j]) + // Dimjm + COST_SUB if genome and read do not match (genome[read_map_address+SIZE_SEED+i]!=read[j]) d = Dimjm; if ((s1 & 3) != (s2 & 3)) { d += COST_SUB; *xij = 1; } + + // If diagonal score is best, store it in Dij (in this case, xij has already been set to the correct value) + // Otherwise, set Dij to the best score between insertion and deletion (and set xij correspondingly). if (d < min_QP) { *Dij = d; } else { @@ -81,22 +108,36 @@ static void DPD_compute( int DPD(int8_t *s1, int8_t *s2, backtrack_t *backtrack, int size_neighbour_in_symbols) { + // Matrices are only computed for neighbours (ie: nucleotides not in the seed). int matrix_size = size_neighbour_in_symbols + 1; + // Only NB_DIAG (odd) diagonals are computed for each matrix. Values outside this diagonal aren't considered. + // ie: we only consider indices where j-diagonal < i < j+diagonal int diagonal = (NB_DIAG / 2) + 1; + // D is the matrix of scores. 
+ // P is the matrix of scores assuming last operation is an insertion. + // Q is the matrix of scores assuming last operation is a deletion. + // X is the matrix of actual operations used for backtracking. int D[matrix_size][matrix_size]; int P[matrix_size][matrix_size]; int Q[matrix_size][matrix_size]; - int X[matrix_size][matrix_size]; + int X[matrix_size][matrix_size]; int min_score = PQD_INIT_VAL; int min_score_i_idx = 0; int min_score_j_idx = 0; int align_distance = 1; + // Set D matrix to 0 + // FIXME : I'm pretty sure this is completely useless as those 0s should be written over before being read. for (int i = 0; i < matrix_size; i++) { for (int j = 0; j < matrix_size; j++) { D[i][j] = 0; } } + + // Set first row and column of P and Q to high values + // Set first row and column of D to correct values + // FIXME : It would probably make more sense to set D[i][0] and D[0][i] to i*COST_GAPO + // but this should also be changed accordingly in DPU then. for (int i = 0; i <= diagonal; i++) { P[i][0] = PQD_INIT_VAL; P[0][i] = PQD_INIT_VAL; @@ -106,6 +147,21 @@ int DPD(int8_t *s1, int8_t *s2, backtrack_t *backtrack, int size_neighbour_in_sy D[0][i] = i * COST_SUB; } + /* D matrix at this point : (assuming COST_SUB=1 and diagonal = 3 ie: NB_DIAG=5) + * (0,0) in bottom left + * + * | + * | + * |3 + * ^|2 + * ||1 + * j|0 1 2 3 + * +-------------- + * i-> + */ + + + // Compute first triangle (bottom left part) for (int i = 1; i < diagonal; i++) { for (int j = 1; j < i + diagonal; j++) { DPD_compute(s1[i - 1], s2[j - 1], &D[i][j], D[i][j - 1], D[i - 1][j], D[i - 1][j - 1], &P[i][j], P[i][j - 1], @@ -114,6 +170,25 @@ int DPD(int8_t *s1, int8_t *s2, backtrack_t *backtrack, int size_neighbour_in_sy Q[i][i + diagonal] = PQD_INIT_VAL; D[i][i + diagonal] = PQD_INIT_VAL; } + + /* D matrix at this point : (assuming COST_SUB=1 and diagonal = 3 ie: NB_DIAG=5) + * (0,0) in bottom left + * An x corresponds to a value that has been computed. 
+ * A M corresponds to PQD_INIT_VAL + * + * | + * | + * | M + * | M x + * |3 x x + * ^|2 x x + * ||1 x x + * j|0 1 2 3 + * +-------------- + * i-> + */ + + // Compute most of the diagonal for (int i = diagonal; i < matrix_size - diagonal; i++) { P[i][i - diagonal] = PQD_INIT_VAL; D[i][i - diagonal] = PQD_INIT_VAL; @@ -125,6 +200,29 @@ int DPD(int8_t *s1, int8_t *s2, backtrack_t *backtrack, int size_neighbour_in_sy D[i][i + diagonal] = PQD_INIT_VAL; } + /* D matrix at this point : (assuming COST_SUB=1, diagonal = 3 ie: NB_DIAG=5, and matrix_size=12) + * (0,0) in bottom left + * An x corresponds to a value that has been computed. + * + * +-----------------------+ + * | M | + * | M x | + * | M x x | + * | M x x x | + * | M x x x x | + * | M x x x x x | + * | M x x x x x M | + * | M x x x x x M | + * |3 x x x x x M | + * ^|2 x x x x M | + * ||1 x x x M | + * j|0 1 2 M | + * +-----------------------+ + * i-> + */ + + // Compute last triangle (top right part) + // (And check for best score in top-most row of D matrix) for (int i = matrix_size - diagonal; i < matrix_size; i++) { P[i][i - diagonal] = PQD_INIT_VAL; D[i][i - diagonal] = PQD_INIT_VAL; @@ -138,6 +236,29 @@ int DPD(int8_t *s1, int8_t *s2, backtrack_t *backtrack, int size_neighbour_in_sy min_score_j_idx = matrix_size - 1; } } + + /* D matrix at this point : (assuming COST_SUB=1, diagonal = 3 ie: NB_DIAG=5, and matrix_size=12) + * (0,0) in bottom left + * An x corresponds to a value that has been computed. 
+ * + * +-----------------------+ + * | M x x x| + * | M x x x x| + * | M x x x x x| + * | M x x x x x M| + * | M x x x x x M | + * | M x x x x x M | + * | M x x x x x M | + * | M x x x x x M | + * |3 x x x x x M | + * ^|2 x x x x M | + * ||1 x x x M | + * j|0 1 2 M | + * +-----------------------+ + * i-> + */ + + // Find the best score in right-most column of D matrix (if it is better than the best from the top row) for (int j = matrix_size - diagonal; j < matrix_size; j++) { if (D[matrix_size - 1][j] < min_score) { min_score = D[matrix_size - 1][j]; @@ -146,6 +267,7 @@ int DPD(int8_t *s1, int8_t *s2, backtrack_t *backtrack, int size_neighbour_in_sy } } + // backtrack step { int i = min_score_i_idx; int j = min_score_j_idx; From 50eacea53aeaf65e01aeea7f6e780910e3914db1 Mon Sep 17 00:00:00 2001 From: amoisson Date: Fri, 14 Jan 2022 16:11:58 +0100 Subject: [PATCH 24/48] modified processread giving 93% performance on chr22 --- host/inc/debug.h | 2 +- host/inc/mapping_file.h | 3 + host/inc/processread.h | 7 + host/inc/profiling.h | 70 ++++ host/src/mapping_file.c | 94 ++--- host/src/processread.c | 889 ++++++++++++++-------------------------- host/src/upvc.c | 47 ++- host/src/vartree.c | 50 +++ 8 files changed, 502 insertions(+), 660 deletions(-) create mode 100644 host/inc/profiling.h diff --git a/host/inc/debug.h b/host/inc/debug.h index 7c3dfea..4ab0fe1 100644 --- a/host/inc/debug.h +++ b/host/inc/debug.h @@ -6,7 +6,7 @@ #define V_DEBUG 5 #define V_TRACE 6 -#define VERBOSE V_TRACE +#define VERBOSE V_INFO #define VERBOSE_COLORS true #define VERBOSE_LOG_LEVEL true diff --git a/host/inc/mapping_file.h b/host/inc/mapping_file.h index 83e2e7f..664287b 100644 --- a/host/inc/mapping_file.h +++ b/host/inc/mapping_file.h @@ -4,9 +4,12 @@ #ifndef __SAM_H__ #define __SAM_H__ +#include "processread.h" + void open_mapping_file(); //TODO : either reuse this code or delete it //void write_mapping_read(uint64_t genome_pos, uint8_t *code, int8_t *read); +void 
write_read_mapping_from_backtrack(char *chromosome_name, uint64_t genome_pos, backtrack_t *backtrack_end, int8_t *read); void write_read_mapping(char *chromosome_name, uint64_t genome_pos, uint8_t *code, uint8_t *read); void close_mapping_file(); diff --git a/host/inc/processread.h b/host/inc/processread.h index 09ad7d8..ded0249 100644 --- a/host/inc/processread.h +++ b/host/inc/processread.h @@ -7,6 +7,13 @@ #include +typedef struct { + int type; + int ix; + int jx; +} backtrack_t; + + #define CODE_SUB 10 #define CODE_DEL 11 #define CODE_INS 12 diff --git a/host/inc/profiling.h b/host/inc/profiling.h new file mode 100644 index 0000000..52c100e --- /dev/null +++ b/host/inc/profiling.h @@ -0,0 +1,70 @@ +#ifndef __PROFILING_H__ +#define __PROFILING_H__ + +#include + +#define STAT_MAX_SUBSTEPS 10 + +struct time_stat_t { + clock_t total_time; + unsigned int number_calls; + clock_t substep_total_times[STAT_MAX_SUBSTEPS]; +}; + +struct time_stat_t profiling[14]; + +#define STAT_DPD 0 +#define STAT_CODE_ALIGNMENT 1 +#define STAT_ADD_TO_NON_MAPPED_READ 2 +#define STAT_GET_READ_UPDATE_POSITIONS 3 +#define STAT_UPDATE_FREQUENCY_TABLE 4 +#define STAT_DO_PROCESS_READ 5 +#define STAT_PROCESS_READ 6 +#define STAT_EXEC_ROUND 7 +#define STAT_EXEC_DPUS 8 +#define STAT_THREAD_GET_READS 9 +#define STAT_THREAD_DISPATCH 10 +#define STAT_THREAD_ACC 11 +#define STAT_THREAD_PROCESS 12 +#define STAT_DO_MAPPING 13 + +#define STAT_RECORD_START(FUNCTION) \ + clock_t profiling_step_time, profiling_last_step_time; \ + clock_t profiling_start_time = clock(); \ + profiling_last_step_time = profiling_start_time; \ + profiling[FUNCTION].number_calls++; + +#define STAT_RECORD_STEP(FUNCTION, STEP_N) \ + profiling_step_time = clock(); \ + profiling[FUNCTION].substep_total_times[STEP_N] += profiling_step_time-profiling_last_step_time; \ + profiling_last_step_time = profiling_step_time; + +#define STAT_RECORD_LAST_STEP(FUNCTION, STEP_N) \ + STAT_RECORD_STEP(FUNCTION, STEP_N) \ + 
profiling[FUNCTION].total_time += profiling_last_step_time-profiling_start_time; + + +#define PRINT_MICROSECONDS(t) \ + if (t<1000000) { \ + printf("%ld.%ldms", t/1000, t%1000); \ + } else { \ + printf("%ld.%lds", t/1000000, (t/1000)%1000); \ + } + +#define PRINT_FUNCTION_STAT(FUNCTION) \ + printf(#FUNCTION ":\n"); \ + printf("\tcalled: %u\n", profiling[FUNCTION].number_calls); \ + printf("\ttotal time:"); \ + PRINT_MICROSECONDS(profiling[FUNCTION].total_time) \ + printf("\n\tsteps:\n"); \ + for (int i=0; i 0) { \ + printf("\t\t"); \ + PRINT_MICROSECONDS(profiling[FUNCTION].substep_total_times[i]) \ + printf("\n"); \ + } \ + } + + +#endif /* __PROFILING_H__ */ diff --git a/host/src/mapping_file.c b/host/src/mapping_file.c index eaa13d5..06aaeba 100644 --- a/host/src/mapping_file.c +++ b/host/src/mapping_file.c @@ -52,87 +52,39 @@ void open_mapping_file() //LOG_DEBUG("mapping header written\n"); } -/* - * TODO: Either reuse this code or delete it -void write_mapping_read(uint64_t genome_pos, uint8_t *code, int8_t *read) +void write_read_mapping_from_backtrack(char *chromosome_name, uint64_t genome_pos, backtrack_t *backtrack_end, int8_t *read) { - const unsigned int flag = 0x0; - const uint8_t mapping_quality = 255;// unknown quality; TODO: set quality - - char cigar[MAX_CIGAR_LENGTH]; - uint32_t cigar_idx = 0; - char last_code = 0; - unsigned int code_count = 0; - int last_position = -1; - int tlen = 0;//TODO: figure out what tlen is supposed to be - - char sequence[SIZE_READ+1]; + char patch[MAX_PATCH_LENGTH]; + int patch_idx=MAX_PATCH_LENGTH; + patch[--patch_idx] = 0; + char nucleotide[4] = {'A', 'C', 'T', 'G'}; - - for (uint32_t code_idx=0; code[code_idx] != CODE_END;) - { - char new_code=0; - int new_position=last_position+1; - switch (code[code_idx++]) - { + + uint8_t read_letter; + for (;backtrack_end->type != CODE_END; backtrack_end--) { + read_letter = read[backtrack_end->jx]; + if (read_letter < 4) { + read_letter = nucleotide[read_letter]; + } else { + 
read_letter = read_letter & (!0x20); + } + switch (backtrack_end->type) { + case 0: + patch[--patch_idx] = '-'; + break; case CODE_SUB: - new_code = CIGAR_MISMATCH; - new_position = code[code_idx++]; - code_idx++; + patch[--patch_idx] = read_letter | 0x20; break; case CODE_INS: - new_code = CIGAR_INSERT; - new_position = code[code_idx++]; - code_idx++; + patch[--patch_idx] = read_letter; break; case CODE_DEL: - new_code = CIGAR_DELETE; - new_position = code[code_idx++]; - code_idx++; + patch[--patch_idx] = '/'; break; } - if (last_code != new_code) - { - if (code_count>0) - { - cigar_idx += sprintf(cigar+cigar_idx, "%u%c", code_count, last_code); - } - last_code = new_code; - code_count = 0; - } - if (new_position > last_position+1) - { - if (code_count > 0) - { - cigar_idx += sprintf(cigar+cigar_idx, "%u%c", code_count, last_code); - code_count = 0; - } - cigar_idx += sprintf(cigar+cigar_idx, "%u%c", new_position-last_position-1, CIGAR_MATCH); - } - code_count++; - last_position = new_position; - } - if (code_count > 0) - { - cigar_idx += sprintf(cigar+cigar_idx, "%u%c", code_count, last_code); } - if (SIZE_READ > last_position+1) - { - cigar_idx += sprintf(cigar+cigar_idx, "%u%c", SIZE_READ-last_position-1, CIGAR_MATCH); - } - cigar[cigar_idx] = 0;//ensure last string character is 0 - LOG_TRACE("cigar : \"%s\"\n", cigar); - - for (uint32_t i=0; i>1)&3 54H 0101 0100 */ #define CODE_G 3 /* ('G'>>1)&3 47H 0100 0111 */ -#define PQD_INIT_VAL (99) +#define PQD_INIT_VAL (999) #if SIZE_READ>120 #define MAX_SUBSTITUTION 31 @@ -39,16 +41,22 @@ #define MAX_SCORE_DIFFERENCE_WITH_BEST 40 -static bool flag_dbg = false; +/* +static void log_nucleotides(int8_t *s, int max_len) { + char nucleotides[4] = {'A', 'C', 'T', 'G'}; + for (int i=0; itype = CODE_SUB; + (*backtrack_end)->ix = i; + (*backtrack_end)->jx = i; + (*backtrack_end)++; + score += COST_SUB; + } else { + (*backtrack_end)->type = 0; + (*backtrack_end)->ix = i; + (*backtrack_end)->jx = i; + (*backtrack_end)++; + } + } 
+ return score; +} + +int DPD(int8_t *s1, int8_t *s2, backtrack_t *backtrack, backtrack_t ** backtrack_end) { + STAT_RECORD_START(STAT_DPD); + /* s1 is a pointer to the genome where the end of the seed was mapped + * s2 is a pointer to the end of the seed in the read + * backtrack is a data structure that will be populated by this function with a sequence of operations + * (insertions, deletions, substitutions or match) paired with their corresponding indices in genome and read + */ // Matrices are only computed for neighbours (ie: nucleotides not in the seed). - int matrix_size = size_neighbour_in_symbols + 1; + int matrix_size = SIZE_READ + 1; // Only NB_DIAG (odd) diagonals are computed for each matrix. Values outside this diagonal aren't considered. // ie: we only consider indices where j-diagonal < i < j+diagonal int diagonal = (NB_DIAG / 2) + 1; @@ -121,10 +157,11 @@ int DPD(int8_t *s1, int8_t *s2, backtrack_t *backtrack, int size_neighbour_in_sy int P[matrix_size][matrix_size]; int Q[matrix_size][matrix_size]; int X[matrix_size][matrix_size]; + // Best score found at the end of computation (upper row/right-most column of D matrix) int min_score = PQD_INIT_VAL; + // Indices of best score found in the upper-right row/column of D matrix int min_score_i_idx = 0; int min_score_j_idx = 0; - int align_distance = 1; // Set D matrix to 0 // FIXME : I'm pretty sure this is completely useless as those 0s should be written over before being read. 
@@ -147,6 +184,7 @@ int DPD(int8_t *s1, int8_t *s2, backtrack_t *backtrack, int size_neighbour_in_sy D[0][i] = i * COST_SUB; } + STAT_RECORD_STEP(STAT_DPD, 0); /* D matrix at this point : (assuming COST_SUB=1 and diagonal = 3 ie: NB_DIAG=5) * (0,0) in bottom left * @@ -170,11 +208,12 @@ int DPD(int8_t *s1, int8_t *s2, backtrack_t *backtrack, int size_neighbour_in_sy Q[i][i + diagonal] = PQD_INIT_VAL; D[i][i + diagonal] = PQD_INIT_VAL; } + STAT_RECORD_STEP(STAT_DPD, 1); /* D matrix at this point : (assuming COST_SUB=1 and diagonal = 3 ie: NB_DIAG=5) * (0,0) in bottom left * An x corresponds to a value that has been computed. - * A M corresponds to PQD_INIT_VAL + * An M corresponds to PQD_INIT_VAL * * | * | @@ -199,10 +238,12 @@ int DPD(int8_t *s1, int8_t *s2, backtrack_t *backtrack, int size_neighbour_in_sy Q[i][i + diagonal] = PQD_INIT_VAL; D[i][i + diagonal] = PQD_INIT_VAL; } + STAT_RECORD_STEP(STAT_DPD, 2); /* D matrix at this point : (assuming COST_SUB=1, diagonal = 3 ie: NB_DIAG=5, and matrix_size=12) * (0,0) in bottom left * An x corresponds to a value that has been computed. + * An M corresponds to PQD_INIT_VAL * * +-----------------------+ * | M | @@ -236,10 +277,12 @@ int DPD(int8_t *s1, int8_t *s2, backtrack_t *backtrack, int size_neighbour_in_sy min_score_j_idx = matrix_size - 1; } } + STAT_RECORD_STEP(STAT_DPD, 3); /* D matrix at this point : (assuming COST_SUB=1, diagonal = 3 ie: NB_DIAG=5, and matrix_size=12) * (0,0) in bottom left * An x corresponds to a value that has been computed. 
+ * An M corresponds to PQD_INIT_VAL * * +-----------------------+ * | M x x x| @@ -266,579 +309,155 @@ int DPD(int8_t *s1, int8_t *s2, backtrack_t *backtrack, int size_neighbour_in_sy min_score_j_idx = j; } } + STAT_RECORD_STEP(STAT_DPD, 4); // backtrack step { int i = min_score_i_idx; int j = min_score_j_idx; backtrack[0].type = CODE_END; + (*backtrack_end) = &backtrack[1]; while ((i > 0) && (j > 0)) { if(X[i][j] == 0) { - i--; - j--; - } else { - if(X[i][j] == 1) { - i--; - j--; - backtrack[align_distance].type = CODE_SUB; - backtrack[align_distance].ix = i; - backtrack[align_distance].jx = j; - align_distance++; - } else { - if(X[i][j] == 2) { - j--; - backtrack[align_distance].type = CODE_INS; - backtrack[align_distance].ix = i; - backtrack[align_distance].jx = j; - align_distance++; - } - else { - i--; - backtrack[align_distance].type = CODE_DEL; - backtrack[align_distance].ix = i; - backtrack[align_distance].jx = j; - align_distance++; - } - } - } - } + // Operation 0 : sequences match and nothing was done : decrease both indices. + i--; + j--; + (*backtrack_end)->type = 0; // FIXME : use CODE_??? instead of 0 + (*backtrack_end)->ix = i; + (*backtrack_end)->jx = j; + (*backtrack_end)++; + } else if(X[i][j] == 1) { + // Operation 1 : substitution : decrease both indices and store substitution code. + i--; + j--; + (*backtrack_end)->type = CODE_SUB; + (*backtrack_end)->ix = i; + (*backtrack_end)->jx = j; + (*backtrack_end)++; + } else if(X[i][j] == 2) { + // Operation 2 : insertion : decrease read index but not genome index and store insertion code. + j--; + (*backtrack_end)->type = CODE_INS; + (*backtrack_end)->ix = i; + (*backtrack_end)->jx = j; + (*backtrack_end)++; + } else { + // Operation 3 : deletion : decrease genome index but not read index and store deletion code. 
+ i--; + (*backtrack_end)->type = CODE_DEL; + (*backtrack_end)->ix = i; + (*backtrack_end)->jx = j; + (*backtrack_end)++; + } + } + backtrack_end--; } - return align_distance; + STAT_RECORD_LAST_STEP(STAT_DPD, 5); + return min_score; } -/* - * encoding of differences between read and sequence of the reference genome. - * coding: substitution CODE_SUB pos x - * deletion CODE_DEL pos x+ - * insertion CODE_INS pos x+ - * end CODE_END - * - * x = A | C | G | T - * x+ = a sequence of at least 1 element (i.e. A, C, G ou T) - * pos = integer (8 bits) : give the offset of the variant from the start of the read - * - * example S 12 A D 56 A T G I 87 T C X ==> substitution (A) position 12, deletion (ATG) position 56, insertion (TC) position 87 - * The code is return in "code" as a table of int8_t - */ +#define NB_FREQ_TABLE_MUTEXES (1<<14) +static pthread_mutex_t freq_table_mutexes[NB_FREQ_TABLE_MUTEXES+1]; +//static pthread_mutex_t print_mutex; -#ifdef USE_INDEL -static int code_alignment(uint8_t *code, int score, int8_t *gen, int8_t *read, unsigned size_neighbour_in_symbols, bool *flag) +bool update_frequency_table( + genome_t *ref_genome, + dpu_result_out_t *result_tab, + int8_t *reads_buffer, + float *reads_quality_buffer, + int pos, + float mapq + ) { - int code_idx, computed_score, backtrack_idx; - int size_read = SIZE_READ; - int size_neighbour = size_neighbour_in_symbols; - backtrack_t backtrak[size_read]; - - *flag = false; - - if (score == 0) { - code[0] = CODE_END; - return 1; - } - - /* First, looking for subsititution only */ - code_idx = 0; - computed_score = 0; - for (int i = SIZE_SEED; i < size_neighbour + SIZE_SEED; i++) { - if ((gen[i] & 3) != read[i]) { - computed_score += COST_SUB; - code[code_idx++] = CODE_SUB; - code[code_idx++] = i; - code[code_idx++] = read[i]; - if (computed_score > score) { - break; + struct frequency_info ** frequency_table = get_frequency_table(); + uint64_t genome_pos = ref_genome->pt_seq[result_tab[pos].coord.seq_nr] + 
result_tab[pos].coord.seed_nr; + int num = result_tab[pos].num; + int8_t *read = reads_buffer + (num * SIZE_READ); + float *read_quality = reads_quality_buffer + (num>>1)*SIZE_READ; + bool invert_read = num & 1; + + backtrack_t backtrack[SIZE_READ<<1]; + backtrack_t * backtrack_end = backtrack; + + /* First, try substitutions only */ + unsigned int computed_score = subOnlyPath(&ref_genome->data[genome_pos], read, backtrack, &backtrack_end); + bool used_dpd = false; + + if (computed_score > result_tab[pos].score) { + /* + if (result_tab[pos].coord.nodp) { + LOG_ERROR("dpu result found with nodp but score not matched with sub only (DPU:%u, host:%d)\n", result_tab[pos].score, computed_score); } - } - } - code[code_idx++] = CODE_END; - if (computed_score == score) - return code_idx; - - /* Otherwise, re-compute the matrix (only some diagonals) and put in backtrack the path */ - backtrack_idx = DPD(gen, read, backtrak, size_neighbour_in_symbols + SIZE_SEED); - if (backtrack_idx == -1) { - code[0] = CODE_ERR; - return 1; - } - - backtrack_idx--; - code_idx = 0; - while (backtrack_idx > 0) { - if (backtrak[backtrack_idx].type == CODE_SUB) { - code[code_idx++] = CODE_SUB; - code[code_idx++] = backtrak[backtrack_idx].jx - 1; - code[code_idx++] = read[backtrak[backtrack_idx].jx - 1]; - backtrack_idx--; - } else { - if (backtrak[backtrack_idx].type == CODE_DEL) { - int backtrack_jx = backtrak[backtrack_idx].jx; - code[code_idx++] = CODE_DEL; - code[code_idx++] = backtrak[backtrack_idx].ix; - code[code_idx++] = gen[backtrak[backtrack_idx].ix] & 3; - backtrack_idx--; - while ((backtrak[backtrack_idx].type == CODE_DEL) && (backtrack_jx == backtrak[backtrack_idx].jx)) { - code[code_idx++] = gen[backtrak[backtrack_idx].ix] & 3; - backtrack_idx--; - } + */ + computed_score = DPD(&ref_genome->data[genome_pos], read, backtrack, &backtrack_end); + used_dpd = true; + if (computed_score == result_tab[pos].score) { + reads_correct_cost_DPD++; } else { - int backtrack_ix = 
backtrak[backtrack_idx].ix; - code[code_idx++] = CODE_INS; - code[code_idx++] = backtrak[backtrack_idx].jx - 1; - code[code_idx++] = read[backtrak[backtrack_idx].jx]; - backtrack_idx--; - while ((backtrak[backtrack_idx].type == CODE_INS) && (backtrack_ix == backtrak[backtrack_idx].ix)) { - code[code_idx++] = read[backtrak[backtrack_idx].jx]; - backtrack_idx--; - } + reads_not_correct_cost++; } - } - } - code[code_idx++] = CODE_END; - return code_idx; -} -#endif - -#if 0 -static void set_variant( - dpu_result_out_t result_match, genome_t *ref_genome, int8_t *reads_buffer, unsigned int size_neighbour_in_symbols) -{ - uint32_t code_result_idx; - uint8_t code_result_tab[256]; - int8_t *read; - char nucleotide[4] = { 'A', 'C', 'T', 'G' }; - uint64_t genome_pos = ref_genome->pt_seq[result_match.coord.seq_nr] + result_match.coord.seed_nr; - int size_read = SIZE_READ; - //LOG_TRACE("set_variant called\n"); - - /* Get the differences betweend the read and the sequence of the reference genome that match */ - read = &reads_buffer[result_match.num * size_read]; - code_alignment(code_result_tab, result_match.score, &ref_genome->data[genome_pos], read, size_neighbour_in_symbols); - if (code_result_tab[0] == CODE_ERR) - return; - - /* Update "mapping_coverage" with the number of reads that match at this position of the genome */ - for (int i = 0; i < size_read; i++) { - ref_genome->mapping_coverage[genome_pos + i] += 1; - } - -#if DEBUG_READ_MAPPING - // TODO: check genome_pos is the expected value - write_read_mapping(genome_pos, code_result_tab); -#endif - - code_result_idx = 0; - while (code_result_tab[code_result_idx] != CODE_END) { - int code_result = code_result_tab[code_result_idx]; - //LOG_DEBUG("code_result=%d\n", code_result); - int64_t pos_variant_read = code_result_tab[code_result_idx + 1]; - int64_t pos_variant_genome = genome_pos + pos_variant_read; - int ref_pos = 0; - int alt_pos = 0; - variant_t *newvar = (variant_t *)malloc(sizeof(variant_t)); - newvar->depth = 
1; - newvar->score = result_match.score; - newvar->next = NULL; - if (code_result == CODE_SUB) { - /* SNP = 0,1,2,3 (code A,C,T,G) */ - int snp = code_result_tab[code_result_idx + 2]; - newvar->ref[ref_pos++] = nucleotide[ref_genome->data[pos_variant_genome] & 3]; - newvar->alt[alt_pos++] = nucleotide[snp & 3]; - - code_result_idx += 3; - } else if (code_result == CODE_INS) { - int64_t ps_var_genome = pos_variant_genome; - int64_t ps_var_read = pos_variant_read; - code_result_idx += 2; - - while (code_result_tab[code_result_idx] < 4) { - ps_var_read++; - code_result_idx++; + /* + if (computed_score > result_tab[pos].score) { + pthread_mutex_lock(&print_mutex); + LOG_WARN("found a computed score of %d where DPU found score of %u\n", computed_score, result_tab[pos].score); + printf("ref: "); + log_nucleotides(&(ref_genome->data[genome_pos]), 130); + printf("\nread: "); + log_nucleotides(read, 120); + printf("\n"); + pthread_mutex_unlock(&print_mutex); } - - while (ref_genome->data[ps_var_genome] == read[ps_var_read]) { - ps_var_genome--; - ps_var_read--; - pos_variant_genome--; - pos_variant_read--; - } - - newvar->ref[ref_pos++] = nucleotide[ref_genome->data[pos_variant_genome] & 3]; - - while (pos_variant_read <= ps_var_read) { - newvar->alt[alt_pos++] = nucleotide[read[pos_variant_read] & 3]; - if (alt_pos >= MAX_SIZE_ALLELE - 1) { - free(newvar); - return; - } - pos_variant_read++; - } - - } else if (code_result == CODE_DEL) { - int64_t ps_var_genome = pos_variant_genome; - int64_t ps_var_read = pos_variant_read; - code_result_idx += 2; - - while (code_result_tab[code_result_idx] < 4) { - ps_var_genome++; - code_result_idx++; - } - - while (ref_genome->data[ps_var_genome] == read[ps_var_read]) { - ps_var_read--; - ps_var_genome--; - pos_variant_genome--; - pos_variant_read--; - } - - newvar->alt[alt_pos++] = nucleotide[ref_genome->data[pos_variant_genome] & 3]; - - while (pos_variant_genome <= ps_var_genome) { - newvar->ref[ref_pos++] = 
nucleotide[ref_genome->data[pos_variant_genome] & 3]; - if (ref_pos >= MAX_SIZE_ALLELE - 1) { - free(newvar); - //LOG_TRACE("set_variant early return\n"); - return; - } - pos_variant_genome++; - } - pos_variant_genome -= ref_pos; - } - newvar->ref[ref_pos] = '\0'; - newvar->alt[alt_pos] = '\0'; - variant_tree_insert( - newvar, result_match.coord.seq_nr, pos_variant_genome + 1 - ref_genome->pt_seq[result_match.coord.seq_nr]); - } - //LOG_TRACE("set_variant return\n"); -} -#endif - -static pthread_mutex_t non_mapped_mutex; -static void add_to_non_mapped_read(int numread, int round, FILE *fpe1, FILE *fpe2, int8_t *reads_buffer) -{ - if (fpe1 == NULL || fpe2 == NULL) - return; - pthread_mutex_lock(&non_mapped_mutex); - char nucleotide[4] = { 'A', 'C', 'T', 'G' }; - int size_read = SIZE_READ; - int8_t *read = &reads_buffer[numread * size_read]; - fprintf(fpe1, ">>%d\n", SIZE_SEED * (round + 1)); - for (int j = SIZE_SEED; j < size_read; j++) { - fprintf(fpe1, "%c", nucleotide[read[j] & 3]); - } - for (int j = 0; j < SIZE_SEED; j++) { - fprintf(fpe1, "A"); + */ + } else { + reads_correct_cost_sub_only++; } - fprintf(fpe1, "\n"); - read = &reads_buffer[(numread + 2) * size_read]; - fprintf(fpe2, ">>%d\n", SIZE_SEED * (round + 1)); - for (int j = SIZE_SEED; j < size_read; j++) { - fprintf(fpe2, "%c", nucleotide[read[j] & 3]); - } - for (int j = 0; j < SIZE_SEED; j++) { - fprintf(fpe2, "A"); - } - fprintf(fpe2, "\n"); - pthread_mutex_unlock(&non_mapped_mutex); -} - -#ifdef USE_INDEL -int get_read_update_positions( - uint64_t * update_genome_position, - dpu_result_out_t *result_tab, - int pos, - genome_t *ref_genome, - uint64_t genome_pos, - int8_t *read, - __attribute__((unused))int size_neighbour_in_symbols, - bool * flag, - bool debug, - uint32_t * substCnt, - char * chromosome_name) { - - // run smith and waterman algorithm to find indels - uint8_t code_result_tab[256]; - code_alignment(code_result_tab, result_tab[pos].score, &ref_genome->data[genome_pos], read, 
size_neighbour_in_symbols, flag); - write_read_mapping(chromosome_name, result_tab[pos].coord.seed_nr, code_result_tab, (uint8_t*) read); - for(int read_pos = 0; read_pos < SIZE_READ; ++read_pos) { - update_genome_position[read_pos] = 0; - } - if (code_result_tab[0] != CODE_ERR) { - - // array that contains for each read position, the genome position that it matches with - // This is the genome position that will be updated in the frequency table - // This genome position takes into account the shift due to possible indels found - // with smith-waterman algorithm - int code_result_index = 0; - int ref_pos = 0; - int nbIndels = 0; - bool ins = false; - while (code_result_tab[code_result_index] != CODE_END) { - int code_result = code_result_tab[code_result_index]; - int64_t pos_variant_read = code_result_tab[code_result_index + 1]; - /*printf("pos variant: %lu\n", pos_variant_read);*/ - int64_t pos_variant_genome = genome_pos + pos_variant_read; - if (code_result == CODE_SUB) { - // do nothing for substitution - code_result_index += 3; - (*substCnt)++; - ref_pos++; - } - else if (code_result == CODE_INS) { - ins = true; - int64_t ps_var_genome = pos_variant_genome; - int64_t ps_var_read = pos_variant_read; - code_result_index += 2; - - while (code_result_tab[code_result_index] < 4) { - ps_var_read++; - code_result_index++; - } - while (ref_genome->data[ps_var_genome] == read[ps_var_read] && ps_var_genome - && pos_variant_read) { - assert(ps_var_genome && ps_var_read && pos_variant_genome && pos_variant_read); - ps_var_genome--; - ps_var_read--; - pos_variant_genome--; - pos_variant_read--; + write_read_mapping_from_backtrack(ref_genome->seq_name[result_tab[pos].coord.seq_nr], genome_pos, backtrack_end, read); + + // /!\ Pointer arithmetics + unsigned int mutex_index = (genome_pos*NB_FREQ_TABLE_MUTEXES)/ref_genome->fasta_file_size; + unsigned int end_mutex_index = ((genome_pos+backtrack_end->ix)*NB_FREQ_TABLE_MUTEXES)/ref_genome->fasta_file_size; + 
pthread_mutex_lock(&freq_table_mutexes[mutex_index]); + if (end_mutex_index != mutex_index) { + if (end_mutex_index>=NB_FREQ_TABLE_MUTEXES) { + printf("end_mutex_index=%u\n", end_mutex_index); + if(used_dpd) { + printf("used dpd\n"); } - - /*newvar->ref[ref_pos++] = nucleotide[ref_genome->data[pos_variant_genome] & 3];*/ - ref_pos++; - - // skip first value which should be the equivalent of first element in ref genome - pos_variant_read++; - while (pos_variant_read <= ps_var_read) { - // position should not be updated yet - if(update_genome_position[pos_variant_read] != 0) { - LOG_WARN("duplicate update (Insertion) at position %lu. Current %lu\n", - pos_variant_read, update_genome_position[pos_variant_read]); - fflush(stdout); - return -1; - } - update_genome_position[pos_variant_read++] = UINT64_MAX; - } - ++nbIndels; - } - else if (code_result == CODE_DEL) { - - int64_t ps_var_genome = pos_variant_genome; - int64_t ps_var_read = pos_variant_read; - code_result_index += 2; - - while (code_result_tab[code_result_index] < 4) { - ps_var_genome++; - code_result_index++; - } - - while (ref_genome->data[ps_var_genome] == read[ps_var_read] && pos_variant_genome && ps_var_read) { - assert(ps_var_genome && ps_var_read && pos_variant_genome && pos_variant_read); - ps_var_read--; - ps_var_genome--; - pos_variant_genome--; - pos_variant_read--; - } - - // on a deletion store the threshold to apply from the current read position - assert(ps_var_genome > pos_variant_genome); - if(pos_variant_read + 1 < SIZE_READ) { - // position should not be updated yet - if(update_genome_position[pos_variant_read+1] != 0) { - LOG_WARN("duplicate update (Deletion) at position %lu. 
Current %lu\n", - pos_variant_read+1, update_genome_position[pos_variant_read+1]); - fflush(stdout); - return -1; - } - /*assert(update_genome_position[pos_variant_read+1] == 0);*/ - update_genome_position[pos_variant_read + 1] = ps_var_genome - pos_variant_genome; - } - while (pos_variant_genome <= ps_var_genome) { - pos_variant_genome++; - ref_pos++; - } - pos_variant_genome -= ref_pos; - ++nbIndels; - } - else - assert(0); + fflush(stdout); } - - // debug prints - if(nbIndels && debug) - printf("SW algorithm (nbIndels %d) ins %d:\n", nbIndels, ins); - int64_t curr_pos = genome_pos; - for(int read_pos = 0; read_pos < SIZE_READ; ++read_pos) { - switch(update_genome_position[read_pos]) { - case 0: - update_genome_position[read_pos] = curr_pos++; - if(nbIndels && debug) - printf(" "); - break; - case UINT64_MAX: - if(nbIndels && debug) - printf("I"); - break; - default: - if(nbIndels && debug) { - for(uint64_t print_index = 0; print_index < update_genome_position[read_pos]; ++print_index) - printf("D"); - } - curr_pos += update_genome_position[read_pos]; - update_genome_position[read_pos] = curr_pos++; + pthread_mutex_lock(&freq_table_mutexes[end_mutex_index]); + } + bool has_indel = false; + uint8_t read_letter; + //printf("genome_pos=%lu\n", genome_pos); + for (; backtrack_end > backtrack; backtrack_end--) { + unsigned int current_position = genome_pos+backtrack_end->ix; + //printf("backtrack_end->ix=%u; current_position=%u\n", backtrack_end->ix, current_position); + if (current_position > ref_genome->fasta_file_size) { + continue; } - } - - if(nbIndels && debug) { - printf("\n"); - fflush(stdout); - } - return nbIndels; - } - else - assert(0); - - return false; -} -#endif - - -static pthread_mutex_t freq_table_mutex; - -/** - * function to update frequency table used for variant calling - **/ -bool update_frequency_table( - genome_t *ref_genome, - dpu_result_out_t *result_tab, - int8_t *reads_buffer, - float *reads_quality_buffer, - int pos, - float mapq, - 
__attribute__((unused))int size_neighbour_in_symbols) { - - struct frequency_info **frequency_table = get_frequency_table(); - uint64_t genome_pos = ref_genome->pt_seq[result_tab[pos].coord.seq_nr] + result_tab[pos].coord.seed_nr; - int num = result_tab[pos].num; - int8_t *read = reads_buffer + (num * SIZE_READ); - float *read_quality = reads_quality_buffer + (num/2 * SIZE_READ); - //TODO: read the quality in the correct order (inverted or not) - bool inv = num & 1; - //TODO: assume no offset here - -#ifdef USE_INDEL - - static bool debug = false; - static char nucleotide[4] = { 'A', 'C', 'T', 'G' }; - uint64_t update_genome_position[SIZE_READ]; - uint32_t substCnt = 0; - flag_dbg = false; - - // for simplicity put all this in a critical section protected by a mutex - // since the frequency table is shared (but inefficient) - - pthread_mutex_lock(&freq_table_mutex); - int nbIndels = get_read_update_positions(update_genome_position, result_tab, pos, - ref_genome, genome_pos, read, size_neighbour_in_symbols, &flag_dbg, debug, &substCnt, ref_genome->seq_name[result_tab[pos].coord.seq_nr]); - bool hasIndel = nbIndels > 0; - - // debug prints - if(hasIndel && debug) { - - printf("Read:\n"); - for(int k = 0; k < SIZE_READ; ++k) { - printf("%c", nucleotide[read[k]]); - } - printf("\ngenome (pos %u:%u):\n", result_tab[pos].coord.seed_nr, result_tab[pos].coord.seed_nr+SIZE_READ); - for(uint64_t k = genome_pos; k < genome_pos + SIZE_READ; ++k) { - printf("%c", nucleotide[ref_genome->data[k]]); - } - printf("\nupdate pos:\n"); - uint64_t lastpos = 0; - for(uint64_t k = 0; k < SIZE_READ; ++k) { - if(k && update_genome_position[k] != lastpos+1) { - if(update_genome_position[k] == UINT64_MAX) - printf("X"); - /*printf("No update at position %lu\n", k);*/ - else if(lastpos == UINT64_MAX) - /*printf("New start at pos %lu = %lu / %c\n", k, update_genome_position[k], nucleotide[ref_genome->data[update_genome_position[k]]]);*/ - printf("%c", 
nucleotide[ref_genome->data[update_genome_position[k]]]); - else - /*printf("Change at pos %lu, diff %ld, %c\n", k, update_genome_position[k] - lastpos, nucleotide[ref_genome->data[update_genome_position[k]]]);*/ - printf("%c", nucleotide[ref_genome->data[update_genome_position[k]]]); + switch (backtrack_end->type) { + case 0: + case CODE_SUB: + read_letter = read[backtrack_end->jx]; + frequency_table[read_letter][current_position].freq += mapq * read_quality[invert_read ? SIZE_READ-backtrack_end->jx-1 : backtrack_end->jx]; + frequency_table[read_letter][current_position].score++; + break; + case CODE_INS: + case CODE_DEL: + has_indel = true; + // TODO: handle indels + break; + // TODO: handle errors even if they should never happen } - /*else if(nucleotide[ref_genome->data[update_genome_position[k]]] != nucleotide[read[k]]) {*/ - /*printf(RED "%c" RESET, nucleotide[ref_genome->data[update_genome_position[k]]]);*/ - /*}*/ - else - printf("%c", nucleotide[ref_genome->data[update_genome_position[k]]]); - lastpos = update_genome_position[k]; - } - - printf("\nsubst:\n"); - for(uint64_t k = 0; k < SIZE_READ; ++k) { - if(update_genome_position[k] == UINT64_MAX) { - printf(" "); - continue; - } - else if(nucleotide[ref_genome->data[update_genome_position[k]]] != nucleotide[read[k]]) { - printf("U"); - substCnt++; - } - else - printf(" "); - } - printf("\n\n"); - fflush(stdout); - assert(!result_tab[pos].coord.nodp); - } - else if(debug) { - if(!result_tab[pos].coord.nodp) { - LOG_WARN("odpd result with no indels detected (flag = %d, subst cnt %u)):\n", flag_dbg, substCnt); - printf("Read:\n"); - for(int k = 0; k < SIZE_READ; ++k) { - printf("%c", nucleotide[read[k]]); - } - printf("\ngenome (pos %u:%u):\n", result_tab[pos].coord.seed_nr, result_tab[pos].coord.seed_nr+SIZE_READ); - for(uint64_t k = genome_pos; k < genome_pos + SIZE_READ; ++k) { - printf("%c", nucleotide[ref_genome->data[k]]); - } - printf("\n\n"); - fflush(stdout); - } - } - - 
/*pthread_mutex_lock(&freq_table_mutex);*/ - // for the moment support only one indel otherwise we have some issue FIXME - if(substCnt <= MAX_SUBSTITUTION && (hasIndel || result_tab[pos].coord.nodp) && nbIndels >= 0) { - for(uint64_t k = 0; k < SIZE_READ; ++k) { - uint64_t update_genome_pos = update_genome_position[k]; - if(update_genome_pos < genome_get()->fasta_file_size) { - frequency_table[read[k]][update_genome_pos].freq += mapq * read_quality[inv ? SIZE_READ - k - 1 : k]; - /*frequency_table[read[j]][genome_pos+j].score += result_tab[pos].score;*/ - frequency_table[read[k]][update_genome_pos].score++; - } - else if (update_genome_pos != UINT64_MAX) - LOG_WARN("genome update position computed is wrong %lu\n", update_genome_pos); - } - } - /*fflush(stdout);*/ - pthread_mutex_unlock(&freq_table_mutex); - return hasIndel; - -#else - pthread_mutex_lock(&freq_table_mutex); - for(int j = 0; j < SIZE_READ; ++j) { - if(genome_pos + j < genome_get()->fasta_file_size) { - frequency_table[read[j]][genome_pos+j].freq += mapq * read_quality[inv ? 
SIZE_READ - j - 1 : j]; - frequency_table[read[j]][genome_pos+j].score++; - } - else - LOG_WARN("reads matched at position that exceeds genome size\n"); } - pthread_mutex_unlock(&freq_table_mutex); - return false; -#endif + pthread_mutex_unlock(&freq_table_mutexes[mutex_index]); + if (end_mutex_index != mutex_index) { + pthread_mutex_unlock(&freq_table_mutexes[end_mutex_index]); + } + return has_indel; } static volatile unsigned int curr_match; @@ -875,7 +494,7 @@ static uint64_t nr_reads_with_indels = 0ULL; static pthread_mutex_t nr_reads_mutex = PTHREAD_MUTEX_INITIALIZER; #define MISMATCH_COUNT(X) (X.score / 10) -#define INVALID_SCORE 1000 +#define INVALID_SCORE 10000 static void keep_best_2_scores(unsigned score, unsigned* P1, unsigned *P2, unsigned x1, unsigned x2, unsigned* best_score) { @@ -909,9 +528,43 @@ static unsigned get_nb_scores(unsigned int * best_score) { return np; } +static pthread_mutex_t non_mapped_mutex; +static void add_to_non_mapped_read(int numread, int round, FILE *fpe1, FILE *fpe2, int8_t *reads_buffer) +{ + STAT_RECORD_START(STAT_ADD_TO_NON_MAPPED_READ); + if (fpe1 == NULL || fpe2 == NULL) + return; + pthread_mutex_lock(&non_mapped_mutex); + STAT_RECORD_STEP(STAT_ADD_TO_NON_MAPPED_READ, 0); + char nucleotide[4] = { 'A', 'C', 'T', 'G' }; + int size_read = SIZE_READ; + int8_t *read = &reads_buffer[numread * size_read]; + fprintf(fpe1, ">>%d\n", SIZE_SEED * (round + 1)); + for (int j = SIZE_SEED; j < size_read; j++) { + fprintf(fpe1, "%c", nucleotide[read[j] & 3]); + } + for (int j = 0; j < SIZE_SEED; j++) { + fprintf(fpe1, "A"); + } + fprintf(fpe1, "\n"); + read = &reads_buffer[(numread + 2) * size_read]; + fprintf(fpe2, ">>%d\n", SIZE_SEED * (round + 1)); + for (int j = SIZE_SEED; j < size_read; j++) { + fprintf(fpe2, "%c", nucleotide[read[j] & 3]); + } + for (int j = 0; j < SIZE_SEED; j++) { + fprintf(fpe2, "A"); + } + fprintf(fpe2, "\n"); + STAT_RECORD_STEP(STAT_ADD_TO_NON_MAPPED_READ, 1); + pthread_mutex_unlock(&non_mapped_mutex); + 
STAT_RECORD_LAST_STEP(STAT_ADD_TO_NON_MAPPED_READ, 2); +} + /*#define USE_MAPQ_SCORE*/ static void do_process_read(process_read_arg_t *arg) { + STAT_RECORD_START(STAT_DO_PROCESS_READ); const unsigned int nb_match = arg->nb_match; dpu_result_out_t *result_tab = arg->result_tab; int round = arg->round; @@ -920,8 +573,6 @@ static void do_process_read(process_read_arg_t *arg) genome_t *ref_genome = arg->ref_genome; FILE *fpe1 = arg->fpe1; FILE *fpe2 = arg->fpe2; - unsigned int size_neighbour_in_symbols = (SIZE_NEIGHBOUR_IN_BYTES - DELTA_NEIGHBOUR(round)) * 4; - /*printf("size_neighbour_in_symbols : %u", size_neighbour_in_symbols);*/ /* * The number of a pair is given by "num_read / 4 " (see dispatch_read function) @@ -938,19 +589,26 @@ static void do_process_read(process_read_arg_t *arg) */ while (true) { + STAT_RECORD_STEP(STAT_DO_PROCESS_READ, 0); unsigned int i; if ((i = acquire_curr_match()) >= nb_match) { release_curr_match(i); + STAT_RECORD_LAST_STEP(STAT_DO_PROCESS_READ, 6); return; } int numpair = result_tab[i].num / 4; unsigned int j = i; while ((j < nb_match) && (numpair == result_tab[j].num / 4)) { - j++; + if (ref_genome->pt_seq[result_tab[j].coord.seq_nr] + result_tab[j].coord.seed_nr == 39317815) { + printf("found 39317815 mapping at index %u during round %d\n", j, round); + } + j++; } release_curr_match(j); + STAT_RECORD_STEP(STAT_DO_PROCESS_READ, 1); - unsigned int best_individual_scores[4] = {UINT32_MAX}; + // find best mapping scores for each of the four reads considered + unsigned int best_individual_scores[4] = {UINT32_MAX, UINT32_MAX, UINT32_MAX, UINT32_MAX}; for (unsigned int x=i; x READ_DIST_LOWER_BOUND && (abs((int)pos2 - (int)pos1) < READ_DIST_UPPER_BOUND))) { + // update if this is one of the two best scores + keep_best_2_scores(result_tab[x1].score + result_tab[x2].score, P1, P2, x1, x2, best_score); + } + } + if (nb_considered_mappings[t1] < max_considered_mappings) { + considered_mappings[t1][nb_considered_mappings[t1]++] = x1; + } else { + 
LOG_ERROR("too many mappings to consider: ignoring some\n"); + } + } + } + if (j-i>4900) { + LOG_WARN("found %d mappings for 4 reads; of which [%u, %u, %u, %u] were considered with best scores : [%u, %u, %u, %u].\n", + j-i, + nb_considered_mappings[0], + nb_considered_mappings[1], + nb_considered_mappings[2], + nb_considered_mappings[3], + best_individual_scores[0], + best_individual_scores[1], + best_individual_scores[2], + best_individual_scores[3] + ); + } + */ /*unsigned int best_score_all = 1000;*/ // test all significant pairs of reads (0,3) & (1,2) for (unsigned int x1 = i; x1 < j; x1++) { @@ -972,7 +668,7 @@ static void do_process_read(process_read_arg_t *arg) pos1 = result_tab[x1].coord.seed_nr; if (result_tab[x1].score < best_individual_scores[t1] + MAX_SCORE_DIFFERENCE_WITH_BEST) { - for (unsigned int x2 = i + 1; x2 < j; x2++) { + for (unsigned int x2 = x1 + 1; x2 < j; x2++) { pos2 = result_tab[x2].coord.seed_nr; t2 = result_tab[x2].num % 4; if (t1 + t2 == 3) // select significant pair @@ -985,12 +681,14 @@ static void do_process_read(process_read_arg_t *arg) } } } + STAT_RECORD_STEP(STAT_DO_PROCESS_READ, 3); bool update = false; bool hasIndel = false; unsigned np = get_nb_scores(best_score); if (np > 0) { + LOG_DEBUG("found at least a pair (%u)\n", np); if(np == 2) { @@ -1011,8 +709,8 @@ static void do_process_read(process_read_arg_t *arg) #else if(delta > DIST_PAIR_THRESHOLD) { #endif - hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P1[0], mapq, size_neighbour_in_symbols); - hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P2[0], mapq, size_neighbour_in_symbols); + hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P1[0], mapq); + hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P2[0], mapq); update = true; } } @@ -1029,17 +727,18 @@ static void 
do_process_read(process_read_arg_t *arg) #else if(delta > DIST_PAIR_THRESHOLD) { #endif - hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P1[0], mapq, size_neighbour_in_symbols); - hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P2[0], mapq, size_neighbour_in_symbols); + hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P1[0], mapq); + hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P2[0], mapq); update = true; } } - } - if(true) { + } + if (true) { // check mapping of R1 and R2 independently - unsigned int best_score_R1[2] = { 1000, 1000 }; - unsigned int best_score_R2[2] = { 1000, 1000 }; + unsigned int best_score_R1[2] = { INVALID_SCORE, INVALID_SCORE }; + unsigned int best_score_R2[2] = { INVALID_SCORE, INVALID_SCORE }; + unsigned int throwaway[2]; P1[0] = 0; P2[0] = 0; P1[1] = 0; @@ -1047,10 +746,10 @@ static void do_process_read(process_read_arg_t *arg) for (unsigned int read = i; read < j; read++) { unsigned t1 = result_tab[read].num % 4; if(t1 < 2) { // PE1 or RPE1 - keep_best_2_scores(result_tab[read].score, P1, P2, read, 0, best_score_R1); + keep_best_2_scores(result_tab[read].score, P1, throwaway, read, 0, best_score_R1); } else { // PE2 or RPE2 - keep_best_2_scores(result_tab[read].score, P1, P2, 0, read, best_score_R2); + keep_best_2_scores(result_tab[read].score, throwaway, P2, 0, read, best_score_R2); } } @@ -1071,7 +770,7 @@ static void do_process_read(process_read_arg_t *arg) #else if(delta > DIST_SINGLE_THRESHOLD) { #endif - hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P1[0], mapq, size_neighbour_in_symbols); + hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P1[0], mapq); update = true; } } @@ -1090,7 +789,7 @@ static void do_process_read(process_read_arg_t *arg) #else 
if(delta > DIST_SINGLE_THRESHOLD) { #endif - hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P1[0], mapq, size_neighbour_in_symbols); + hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P1[0], mapq); update = true; } } @@ -1113,7 +812,7 @@ static void do_process_read(process_read_arg_t *arg) if(delta > DIST_SINGLE_THRESHOLD) { #endif - hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P2[0], mapq, size_neighbour_in_symbols); + hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P2[0], mapq); update = true; } } @@ -1132,10 +831,11 @@ static void do_process_read(process_read_arg_t *arg) #else if (delta > DIST_SINGLE_THRESHOLD) { #endif - hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P2[0], mapq, size_neighbour_in_symbols); + hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P2[0], mapq); update = true; } } + STAT_RECORD_STEP(STAT_DO_PROCESS_READ, 4); if(!update) { pthread_mutex_lock(&nr_reads_mutex); nr_reads_non_mapped++; @@ -1149,6 +849,7 @@ static void do_process_read(process_read_arg_t *arg) pthread_mutex_unlock(&nr_reads_mutex); } } + STAT_RECORD_STEP(STAT_DO_PROCESS_READ, 5); } #define PROCESS_READ_THREAD (8) @@ -1159,11 +860,13 @@ static bool stop_threads = false; void process_read(FILE *fpe1, FILE *fpe2, int round, unsigned int pass_id) { + STAT_RECORD_START(STAT_PROCESS_READ); + if (pass_id>10) + return; int8_t *reads_buffer = get_reads_buffer(pass_id); float *reads_quality_buffer = get_reads_quality_buffer(pass_id); acc_results_t acc_res = accumulate_get_result(pass_id); nr_reads_total += get_reads_in_buffer(pass_id) / 4; - curr_match = 0; args.nb_match = acc_res.nb_res; @@ -1174,11 +877,16 @@ void process_read(FILE *fpe1, FILE *fpe2, int round, unsigned int pass_id) args.fpe1 = fpe1; 
args.fpe2 = fpe2; + STAT_RECORD_STEP(STAT_PROCESS_READ, 0); pthread_barrier_wait(&barrier); + STAT_RECORD_STEP(STAT_PROCESS_READ, 1); do_process_read(&args); + STAT_RECORD_STEP(STAT_PROCESS_READ, 2); pthread_barrier_wait(&barrier); + STAT_RECORD_STEP(STAT_PROCESS_READ, 3); free(acc_res.results); + STAT_RECORD_LAST_STEP(STAT_PROCESS_READ, 4); } static void *process_read_thread_fct(void *arg) @@ -1202,7 +910,9 @@ void process_read_init() assert(pthread_mutex_init(&curr_match_mutex, NULL) == 0); assert(pthread_mutex_init(&non_mapped_mutex, NULL) == 0); - assert(pthread_mutex_init(&freq_table_mutex, NULL) == 0); + for (int i=0; inb_seq; seq_number++) { /* for each position in the sequence */ for (uint64_t seq_position = 0; seq_position < ref_genome->len_seq[seq_number]; seq_position++) { variant_t ** results = get_most_frequent_variant(ref_genome, frequency_table, seq_number, seq_position); + unsigned int total_score = 0; + total_score += frequency_table[0][ref_genome->pt_seq[seq_number] + seq_position].score; + total_score += frequency_table[1][ref_genome->pt_seq[seq_number] + seq_position].score; + total_score += frequency_table[2][ref_genome->pt_seq[seq_number] + seq_position].score; + total_score += frequency_table[3][ref_genome->pt_seq[seq_number] + seq_position].score; + total_coverage += total_score; + //total_score += frequency_table[4][ref_genome->pt_seq[seq_number] + seq_position].score; + if (total_score == 0) { + uncovered_nucleotides++; + } else if (total_score < 3) { + badly_covered_nucleotides++; + } else { + well_covered_nucleotides++; + if (total_score > 50) { + overly_covered_nucleotides++; + } + if (total_score > max_coverage) { + max_coverage = total_score; + chromosome_most_coverage = seq_number; + position_most_coverage = seq_position; + } + } int nb_var = 0; for(int i = 0; i < 5; ++i) { variant_t * var = results[i]; @@ -432,6 +462,26 @@ void create_vcf() free_frequency_table(); fclose(vcf_file); + unsigned int total_nucleotides = 
well_covered_nucleotides+badly_covered_nucleotides+uncovered_nucleotides; + printf("\tuncovered nucleotides: %u (%u.%u%%)\n", + uncovered_nucleotides, + uncovered_nucleotides*100/total_nucleotides, + uncovered_nucleotides*10000/total_nucleotides%100); + printf("\tbadly covered nucleotides (less than 3 reads): %u (%u.%u%%)\n", + badly_covered_nucleotides, + badly_covered_nucleotides*100/total_nucleotides, + badly_covered_nucleotides*10000/total_nucleotides%100); + printf("\twell covered nucleotides (3 reads or more): %u (%u.%u%%)\n", + well_covered_nucleotides, + well_covered_nucleotides*100/total_nucleotides, + well_covered_nucleotides*10000/total_nucleotides%100); + printf("\toverly covered nucleotides (more than 50 reads): %u (%u.%u%%)\n", + overly_covered_nucleotides, + overly_covered_nucleotides*100/total_nucleotides, + overly_covered_nucleotides*10000/total_nucleotides%100); + printf("\tmax coverage: %u reads\n", max_coverage); + printf("\tmax coverage position: chr%u:%u\n", chromosome_most_coverage, position_most_coverage); + printf("\ttotal coverage: %u (eq %u reads; or %ux coverage)\n", total_coverage, total_coverage/SIZE_READ, total_coverage/total_nucleotides); printf("\tnumber of variants: %d (multiple %d)\n", nb_variant, nb_pos_multiple_var); printf("\ttime: %lf s\n", my_clock() - start_time); } From 93c352dbf97030cdc1fa0e5352b0b1a7ba7d05ce Mon Sep 17 00:00:00 2001 From: amoisson Date: Wed, 19 Jan 2022 11:17:41 +0100 Subject: [PATCH 25/48] fixed backtrack_end offset --- host/src/processread.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/host/src/processread.c b/host/src/processread.c index 22d9670..b999038 100644 --- a/host/src/processread.c +++ b/host/src/processread.c @@ -350,7 +350,7 @@ int DPD(int8_t *s1, int8_t *s2, backtrack_t *backtrack, backtrack_t ** backtrack (*backtrack_end)++; } } - backtrack_end--; + (*backtrack_end)--; } STAT_RECORD_LAST_STEP(STAT_DPD, 5); From ac3b97643b94e2dcfa00bb19e5a1e3ee60c3c45a Mon Sep 17 
00:00:00 2001 From: amoisson Date: Fri, 21 Jan 2022 10:38:09 +0100 Subject: [PATCH 26/48] fixed small bug + added more debug prints --- host/inc/mapping_file.h | 2 +- host/inc/profiling.h | 31 ++++++++++++----------- host/src/mapping_file.c | 4 +-- host/src/processread.c | 55 ++++++++++++++++++++++++++--------------- host/src/upvc.c | 1 + 5 files changed, 55 insertions(+), 38 deletions(-) diff --git a/host/inc/mapping_file.h b/host/inc/mapping_file.h index 664287b..715369a 100644 --- a/host/inc/mapping_file.h +++ b/host/inc/mapping_file.h @@ -9,7 +9,7 @@ void open_mapping_file(); //TODO : either reuse this code or delete it //void write_mapping_read(uint64_t genome_pos, uint8_t *code, int8_t *read); -void write_read_mapping_from_backtrack(char *chromosome_name, uint64_t genome_pos, backtrack_t *backtrack_end, int8_t *read); +void write_read_mapping_from_backtrack(char *chromosome_name, uint64_t genome_pos, backtrack_t *backtrack_end, int8_t *read, int read_id); void write_read_mapping(char *chromosome_name, uint64_t genome_pos, uint8_t *code, uint8_t *read); void close_mapping_file(); diff --git a/host/inc/profiling.h b/host/inc/profiling.h index 52c100e..2e5a4a1 100644 --- a/host/inc/profiling.h +++ b/host/inc/profiling.h @@ -11,22 +11,23 @@ struct time_stat_t { clock_t substep_total_times[STAT_MAX_SUBSTEPS]; }; -struct time_stat_t profiling[14]; +struct time_stat_t profiling[15]; -#define STAT_DPD 0 -#define STAT_CODE_ALIGNMENT 1 -#define STAT_ADD_TO_NON_MAPPED_READ 2 -#define STAT_GET_READ_UPDATE_POSITIONS 3 -#define STAT_UPDATE_FREQUENCY_TABLE 4 -#define STAT_DO_PROCESS_READ 5 -#define STAT_PROCESS_READ 6 -#define STAT_EXEC_ROUND 7 -#define STAT_EXEC_DPUS 8 -#define STAT_THREAD_GET_READS 9 -#define STAT_THREAD_DISPATCH 10 -#define STAT_THREAD_ACC 11 -#define STAT_THREAD_PROCESS 12 -#define STAT_DO_MAPPING 13 +#define STAT_SUB_ONLY_PATH 0 +#define STAT_DPD 1 +#define STAT_CODE_ALIGNMENT 2 +#define STAT_ADD_TO_NON_MAPPED_READ 3 +#define 
STAT_GET_READ_UPDATE_POSITIONS 4 +#define STAT_UPDATE_FREQUENCY_TABLE 5 +#define STAT_DO_PROCESS_READ 6 +#define STAT_PROCESS_READ 7 +#define STAT_EXEC_ROUND 8 +#define STAT_EXEC_DPUS 9 +#define STAT_THREAD_GET_READS 10 +#define STAT_THREAD_DISPATCH 11 +#define STAT_THREAD_ACC 12 +#define STAT_THREAD_PROCESS 13 +#define STAT_DO_MAPPING 14 #define STAT_RECORD_START(FUNCTION) \ clock_t profiling_step_time, profiling_last_step_time; \ diff --git a/host/src/mapping_file.c b/host/src/mapping_file.c index 06aaeba..9345273 100644 --- a/host/src/mapping_file.c +++ b/host/src/mapping_file.c @@ -52,7 +52,7 @@ void open_mapping_file() //LOG_DEBUG("mapping header written\n"); } -void write_read_mapping_from_backtrack(char *chromosome_name, uint64_t genome_pos, backtrack_t *backtrack_end, int8_t *read) +void write_read_mapping_from_backtrack(char *chromosome_name, uint64_t genome_pos, backtrack_t *backtrack_end, int8_t *read, int read_id) { char patch[MAX_PATCH_LENGTH]; int patch_idx=MAX_PATCH_LENGTH; @@ -83,7 +83,7 @@ void write_read_mapping_from_backtrack(char *chromosome_name, uint64_t genome_po break; } } - fprintf(mapping_file, "%s\t%lu\t%s\n", chromosome_name, genome_pos, &patch[patch_idx]); + fprintf(mapping_file, "%s\t%lu\t%s\t%d\n", chromosome_name, genome_pos, &patch[patch_idx], read_id); } void write_read_mapping(char *chromosome_name, uint64_t genome_pos, uint8_t *code, uint8_t *read) { diff --git a/host/src/processread.c b/host/src/processread.c index b999038..62c3a7a 100644 --- a/host/src/processread.c +++ b/host/src/processread.c @@ -116,6 +116,7 @@ static void DPD_compute( int subOnlyPath(int8_t *s1, int8_t *s2, backtrack_t* backtrack, backtrack_t ** backtrack_end) { + STAT_RECORD_START(STAT_SUB_ONLY_PATH); int score = 0; backtrack[0].type = CODE_END; (*backtrack_end) = &backtrack[1]; @@ -133,6 +134,8 @@ int subOnlyPath(int8_t *s1, int8_t *s2, backtrack_t* backtrack, backtrack_t ** b (*backtrack_end)++; } } + (*backtrack_end)--; + 
STAT_RECORD_LAST_STEP(STAT_SUB_ONLY_PATH, 0); return score; } @@ -317,7 +320,7 @@ int DPD(int8_t *s1, int8_t *s2, backtrack_t *backtrack, backtrack_t ** backtrack int j = min_score_j_idx; backtrack[0].type = CODE_END; (*backtrack_end) = &backtrack[1]; - while ((i > 0) && (j > 0)) { + while ((i > 0) || (j > 0)) { if(X[i][j] == 0) { // Operation 0 : sequences match and nothing was done : decrease both indices. i--; @@ -370,6 +373,7 @@ bool update_frequency_table( float mapq ) { + STAT_RECORD_START(STAT_UPDATE_FREQUENCY_TABLE); struct frequency_info ** frequency_table = get_frequency_table(); uint64_t genome_pos = ref_genome->pt_seq[result_tab[pos].coord.seq_nr] + result_tab[pos].coord.seed_nr; int num = result_tab[pos].num; @@ -383,6 +387,7 @@ bool update_frequency_table( /* First, try substitutions only */ unsigned int computed_score = subOnlyPath(&ref_genome->data[genome_pos], read, backtrack, &backtrack_end); bool used_dpd = false; + STAT_RECORD_STEP(STAT_UPDATE_FREQUENCY_TABLE, 0); if (computed_score > result_tab[pos].score) { /* @@ -412,8 +417,11 @@ bool update_frequency_table( } else { reads_correct_cost_sub_only++; } + STAT_RECORD_STEP(STAT_UPDATE_FREQUENCY_TABLE, 1); + + write_read_mapping_from_backtrack(ref_genome->seq_name[result_tab[pos].coord.seq_nr], genome_pos, backtrack_end, read, num); - write_read_mapping_from_backtrack(ref_genome->seq_name[result_tab[pos].coord.seq_nr], genome_pos, backtrack_end, read); + STAT_RECORD_STEP(STAT_UPDATE_FREQUENCY_TABLE, 2); // /!\ Pointer arithmetics unsigned int mutex_index = (genome_pos*NB_FREQ_TABLE_MUTEXES)/ref_genome->fasta_file_size; @@ -429,6 +437,7 @@ bool update_frequency_table( } pthread_mutex_lock(&freq_table_mutexes[end_mutex_index]); } + STAT_RECORD_STEP(STAT_UPDATE_FREQUENCY_TABLE, 3); bool has_indel = false; uint8_t read_letter; //printf("genome_pos=%lu\n", genome_pos); @@ -448,15 +457,18 @@ bool update_frequency_table( case CODE_INS: case CODE_DEL: has_indel = true; + //LOG_WARN("unhandled indel\n"); // 
TODO: handle indels break; // TODO: handle errors even if they should never happen } } + STAT_RECORD_STEP(STAT_UPDATE_FREQUENCY_TABLE, 4); pthread_mutex_unlock(&freq_table_mutexes[mutex_index]); if (end_mutex_index != mutex_index) { pthread_mutex_unlock(&freq_table_mutexes[end_mutex_index]); } + STAT_RECORD_LAST_STEP(STAT_UPDATE_FREQUENCY_TABLE, 5); return has_indel; } @@ -599,9 +611,6 @@ static void do_process_read(process_read_arg_t *arg) int numpair = result_tab[i].num / 4; unsigned int j = i; while ((j < nb_match) && (numpair == result_tab[j].num / 4)) { - if (ref_genome->pt_seq[result_tab[j].coord.seq_nr] + result_tab[j].coord.seed_nr == 39317815) { - printf("found 39317815 mapping at index %u during round %d\n", j, round); - } j++; } release_curr_match(j); @@ -666,7 +675,7 @@ static void do_process_read(process_read_arg_t *arg) for (unsigned int x1 = i; x1 < j; x1++) { t1 = result_tab[x1].num % 4; pos1 = result_tab[x1].coord.seed_nr; - if (result_tab[x1].score < best_individual_scores[t1] + MAX_SCORE_DIFFERENCE_WITH_BEST) + if (result_tab[x1].score <= best_individual_scores[t1] + MAX_SCORE_DIFFERENCE_WITH_BEST) { for (unsigned int x2 = x1 + 1; x2 < j; x2++) { pos2 = result_tab[x2].coord.seed_nr; @@ -704,14 +713,16 @@ static void do_process_read(process_read_arg_t *arg) if(delta_corrected < 0) { LOG_WARN("negative delta for square root %d\n", delta_corrected); } - else if(delta > DIST_PAIR_THRESHOLD) { + else if(delta >= DIST_PAIR_THRESHOLD) { mapq = 1.0 - sqrt((double)delta_corrected / SIZE_READ); #else - if(delta > DIST_PAIR_THRESHOLD) { + if(delta >= DIST_PAIR_THRESHOLD) { #endif hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P1[0], mapq); hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P2[0], mapq); update = true; + } else { + LOG_WARN("unusable pair (%u)\n", result_tab[i].num/4); } } else if(np) { // only one result, take it @@ -722,18 +733,22 @@ static void 
do_process_read(process_read_arg_t *arg) if(delta_corrected < 0) { LOG_WARN("negative delta (np == 1) for square root %d\n", delta_corrected); } - else if(delta > DIST_PAIR_THRESHOLD) { + else if(delta >= DIST_PAIR_THRESHOLD) { mapq = 1.0 - sqrt((double)delta_corrected / SIZE_READ); #else - if(delta > DIST_PAIR_THRESHOLD) { + if(delta >= DIST_PAIR_THRESHOLD) { #endif hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P1[0], mapq); hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P2[0], mapq); update = true; + } else { + LOG_WARN("unusable pair (%u)\n", result_tab[i].num/4); } } - } - if (true) { + } else { + LOG_WARN("could not pair (%u)\n", result_tab[i].num/4); + } + if (false) { // check mapping of R1 and R2 independently unsigned int best_score_R1[2] = { INVALID_SCORE, INVALID_SCORE }; @@ -765,10 +780,10 @@ static void do_process_read(process_read_arg_t *arg) if(delta_corrected < 0) { LOG_WARN("negative delta (np1 == 2) for square root %d\n", delta_corrected); } - else if(delta > DIST_SINGLE_THRESHOLD) { + else if(delta >= DIST_SINGLE_THRESHOLD) { mapq = 1.0 - sqrt((double)delta_corrected / SIZE_READ); #else - if(delta > DIST_SINGLE_THRESHOLD) { + if(delta >= DIST_SINGLE_THRESHOLD) { #endif hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P1[0], mapq); update = true; @@ -784,10 +799,10 @@ static void do_process_read(process_read_arg_t *arg) if(delta_corrected < 0) { LOG_WARN("negative delta (np1 == 1) for square root %d\n", delta_corrected); } - else if(delta > DIST_SINGLE_THRESHOLD) { + else if(delta >= DIST_SINGLE_THRESHOLD) { mapq = 1.0 - sqrt((double)delta_corrected / SIZE_READ); #else - if(delta > DIST_SINGLE_THRESHOLD) { + if(delta >= DIST_SINGLE_THRESHOLD) { #endif hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P1[0], mapq); update = true; @@ -806,10 +821,10 @@ static void 
do_process_read(process_read_arg_t *arg) LOG_WARN("negative delta (np2 == 2) for square root %d, %d %d %d %d\n", delta_corrected, MISMATCH_COUNT(result_tab[P2[0]]), MISMATCH_COUNT(result_tab[P2[1]]), MAX_SUBSTITUTION + 1, delta); } - else if(delta > DIST_SINGLE_THRESHOLD) { + else if(delta >= DIST_SINGLE_THRESHOLD) { mapq = 1.0 - sqrt((double)delta_corrected / SIZE_READ); #else - if(delta > DIST_SINGLE_THRESHOLD) { + if(delta >= DIST_SINGLE_THRESHOLD) { #endif hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P2[0], mapq); @@ -826,10 +841,10 @@ static void do_process_read(process_read_arg_t *arg) if(delta_corrected < 0) { LOG_WARN("negative delta (np2 == 1) for square root %d\n", delta_corrected); } - else if (delta > DIST_SINGLE_THRESHOLD) { + else if (delta >= DIST_SINGLE_THRESHOLD) { mapq = 1.0 - sqrt((double)delta_corrected / SIZE_READ); #else - if (delta > DIST_SINGLE_THRESHOLD) { + if (delta >= DIST_SINGLE_THRESHOLD) { #endif hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P2[0], mapq); update = true; diff --git a/host/src/upvc.c b/host/src/upvc.c index eb5ac35..fdf3af2 100644 --- a/host/src/upvc.c +++ b/host/src/upvc.c @@ -407,6 +407,7 @@ int main(int argc, char *argv[]) PRINT_FUNCTION_STAT(STAT_UPDATE_FREQUENCY_TABLE); PRINT_FUNCTION_STAT(STAT_GET_READ_UPDATE_POSITIONS); PRINT_FUNCTION_STAT(STAT_CODE_ALIGNMENT); + PRINT_FUNCTION_STAT(STAT_SUB_ONLY_PATH); PRINT_FUNCTION_STAT(STAT_DPD); index_free(); From fa700ecf486a59774314e829fc375292a41b659b Mon Sep 17 00:00:00 2001 From: amoisson Date: Tue, 1 Feb 2022 10:00:41 +0100 Subject: [PATCH 27/48] some bug fixes and profiling mixed together --- dpu/src/task.c | 4 +- host/inc/profiling.h | 8 +-- host/src/processread.c | 127 ++++++++++++++++------------------------- host/src/upvc.c | 12 +++- 4 files changed, 67 insertions(+), 84 deletions(-) diff --git a/dpu/src/task.c b/dpu/src/task.c index f349f4c..dab0f99 100644 --- 
a/dpu/src/task.c +++ b/dpu/src/task.c @@ -56,9 +56,9 @@ __host dpu_compute_time_t DPU_COMPUTE_TIME_VAR; * @brief Maximum score allowed. */ #if SIZE_READ>120 -#define MAX_SCORE 294 +#define MAX_SCORE 128 #else -#define MAX_SCORE 144 +#define MAX_SCORE 86 #endif /** diff --git a/host/inc/profiling.h b/host/inc/profiling.h index 2e5a4a1..c5098ca 100644 --- a/host/inc/profiling.h +++ b/host/inc/profiling.h @@ -46,10 +46,10 @@ struct time_stat_t profiling[15]; #define PRINT_MICROSECONDS(t) \ - if (t<1000000) { \ - printf("%ld.%ldms", t/1000, t%1000); \ + if (t 0) { \ - printf("\t\t"); \ + printf("\t\t%d:\t", i); \ PRINT_MICROSECONDS(profiling[FUNCTION].substep_total_times[i]) \ printf("\n"); \ } \ diff --git a/host/src/processread.c b/host/src/processread.c index 62c3a7a..4b86c48 100644 --- a/host/src/processread.c +++ b/host/src/processread.c @@ -31,7 +31,7 @@ #define CODE_T 2 /* ('T'>>1)&3 54H 0101 0100 */ #define CODE_G 3 /* ('G'>>1)&3 47H 0100 0111 */ -#define PQD_INIT_VAL (999) +#define PQD_INIT_VAL (99) #if SIZE_READ>120 #define MAX_SUBSTITUTION 31 @@ -40,6 +40,7 @@ #endif #define MAX_SCORE_DIFFERENCE_WITH_BEST 40 +#define MAX_CONSIDERED_MAPPINGS 1000 /* static void log_nucleotides(int8_t *s, int max_len) { @@ -360,7 +361,7 @@ int DPD(int8_t *s1, int8_t *s2, backtrack_t *backtrack, backtrack_t ** backtrack return min_score; } -#define NB_FREQ_TABLE_MUTEXES (1<<14) +#define NB_FREQ_TABLE_MUTEXES (1<<10) static pthread_mutex_t freq_table_mutexes[NB_FREQ_TABLE_MUTEXES+1]; //static pthread_mutex_t print_mutex; @@ -386,7 +387,7 @@ bool update_frequency_table( /* First, try substitutions only */ unsigned int computed_score = subOnlyPath(&ref_genome->data[genome_pos], read, backtrack, &backtrack_end); - bool used_dpd = false; + //bool used_dpd = false; STAT_RECORD_STEP(STAT_UPDATE_FREQUENCY_TABLE, 0); if (computed_score > result_tab[pos].score) { @@ -396,7 +397,7 @@ bool update_frequency_table( } */ computed_score = DPD(&ref_genome->data[genome_pos], read, backtrack, 
&backtrack_end); - used_dpd = true; + //used_dpd = true; if (computed_score == result_tab[pos].score) { reads_correct_cost_DPD++; } else { @@ -428,13 +429,6 @@ bool update_frequency_table( unsigned int end_mutex_index = ((genome_pos+backtrack_end->ix)*NB_FREQ_TABLE_MUTEXES)/ref_genome->fasta_file_size; pthread_mutex_lock(&freq_table_mutexes[mutex_index]); if (end_mutex_index != mutex_index) { - if (end_mutex_index>=NB_FREQ_TABLE_MUTEXES) { - printf("end_mutex_index=%u\n", end_mutex_index); - if(used_dpd) { - printf("used dpd\n"); - } - fflush(stdout); - } pthread_mutex_lock(&freq_table_mutexes[end_mutex_index]); } STAT_RECORD_STEP(STAT_UPDATE_FREQUENCY_TABLE, 3); @@ -451,6 +445,9 @@ bool update_frequency_table( case 0: case CODE_SUB: read_letter = read[backtrack_end->jx]; + if (read_letter > 3) { + read_letter = read_letter>>1 & 0x3; + } frequency_table[read_letter][current_position].freq += mapq * read_quality[invert_read ? SIZE_READ-backtrack_end->jx-1 : backtrack_end->jx]; frequency_table[read_letter][current_position].score++; break; @@ -506,7 +503,7 @@ static uint64_t nr_reads_with_indels = 0ULL; static pthread_mutex_t nr_reads_mutex = PTHREAD_MUTEX_INITIALIZER; #define MISMATCH_COUNT(X) (X.score / 10) -#define INVALID_SCORE 10000 +#define INVALID_SCORE 1000 static void keep_best_2_scores(unsigned score, unsigned* P1, unsigned *P2, unsigned x1, unsigned x2, unsigned* best_score) { @@ -625,68 +622,47 @@ static void do_process_read(process_read_arg_t *arg) best_individual_scores[t] = result_tab[x].score; } + unsigned int nb_considered_mappings[4] = {0,0,0,0}; + unsigned int considered_mappings[4][MAX_CONSIDERED_MAPPINGS]; + for (unsigned int x=i; x result_tab[x].score && nb_considered_mappings[t] READ_DIST_LOWER_BOUND && (abs((int)pos2 - (int)pos1) < READ_DIST_UPPER_BOUND))) { - // update if this is one of the two best scores - keep_best_2_scores(result_tab[x1].score + result_tab[x2].score, P1, P2, x1, x2, best_score); - } - } - if (nb_considered_mappings[t1] < 
max_considered_mappings) { - considered_mappings[t1][nb_considered_mappings[t1]++] = x1; - } else { - LOG_ERROR("too many mappings to consider: ignoring some\n"); - } + + for (unsigned int x1 = 0; x1 < nb_considered_mappings[0]; x1++) { + pos1 = result_tab[considered_mappings[0][x1]].coord.seed_nr; + int score1 = result_tab[considered_mappings[0][x1]].score; + for (unsigned int x2 = 0; x2 < nb_considered_mappings[3]; x2++) { + pos2 = result_tab[considered_mappings[3][x2]].coord.seed_nr; + int score2 = result_tab[considered_mappings[3][x2]].score; + if ((abs((int)pos2 - (int)pos1) > READ_DIST_LOWER_BOUND && (abs((int)pos2 - (int)pos1) < READ_DIST_UPPER_BOUND))) { + // update if this is one of the two best scores + keep_best_2_scores(score1 + score2, P1, P2, considered_mappings[0][x1], considered_mappings[3][x2], best_score); + } } } - if (j-i>4900) { - LOG_WARN("found %d mappings for 4 reads; of which [%u, %u, %u, %u] were considered with best scores : [%u, %u, %u, %u].\n", - j-i, - nb_considered_mappings[0], - nb_considered_mappings[1], - nb_considered_mappings[2], - nb_considered_mappings[3], - best_individual_scores[0], - best_individual_scores[1], - best_individual_scores[2], - best_individual_scores[3] - ); - } - */ - /*unsigned int best_score_all = 1000;*/ - // test all significant pairs of reads (0,3) & (1,2) - for (unsigned int x1 = i; x1 < j; x1++) { - t1 = result_tab[x1].num % 4; - pos1 = result_tab[x1].coord.seed_nr; - if (result_tab[x1].score <= best_individual_scores[t1] + MAX_SCORE_DIFFERENCE_WITH_BEST) - { - for (unsigned int x2 = x1 + 1; x2 < j; x2++) { - pos2 = result_tab[x2].coord.seed_nr; - t2 = result_tab[x2].num % 4; - if (t1 + t2 == 3) // select significant pair - { - if ((abs((int)pos2 - (int)pos1) > READ_DIST_LOWER_BOUND && (abs((int)pos2 - (int)pos1) < READ_DIST_UPPER_BOUND))) { - // update if this is one of the two best scores - keep_best_2_scores(result_tab[x1].score + result_tab[x2].score, P1, P2, x1, x2, best_score); - } - } + for 
(unsigned int x1 = 0; x1 < nb_considered_mappings[1]; x1++) { + pos1 = result_tab[considered_mappings[1][x1]].coord.seed_nr; + int score1 = result_tab[considered_mappings[1][x1]].score; + for (unsigned int x2 = 0; x2 < nb_considered_mappings[2]; x2++) { + pos2 = result_tab[considered_mappings[2][x2]].coord.seed_nr; + int score2 = result_tab[considered_mappings[2][x2]].score; + if ((abs((int)pos2 - (int)pos1) > READ_DIST_LOWER_BOUND && (abs((int)pos2 - (int)pos1) < READ_DIST_UPPER_BOUND))) { + // update if this is one of the two best scores + keep_best_2_scores(score1 + score2, P1, P2, considered_mappings[1][x1], considered_mappings[2][x2], best_score); } } } @@ -721,9 +697,9 @@ static void do_process_read(process_read_arg_t *arg) hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P1[0], mapq); hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P2[0], mapq); update = true; - } else { + }/* else { LOG_WARN("unusable pair (%u)\n", result_tab[i].num/4); - } + }*/ } else if(np) { // only one result, take it int delta = abs((int)(MISMATCH_COUNT(result_tab[P1[0]]) + MISMATCH_COUNT(result_tab[P2[0]])) - (2 * (MAX_SUBSTITUTION + 1))); @@ -741,14 +717,12 @@ static void do_process_read(process_read_arg_t *arg) hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P1[0], mapq); hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P2[0], mapq); update = true; - } else { + }/* else { LOG_WARN("unusable pair (%u)\n", result_tab[i].num/4); - } + }*/ } - } else { - LOG_WARN("could not pair (%u)\n", result_tab[i].num/4); } - if (false) { + if (true) { // check mapping of R1 and R2 independently unsigned int best_score_R1[2] = { INVALID_SCORE, INVALID_SCORE }; @@ -852,9 +826,9 @@ static void do_process_read(process_read_arg_t *arg) } STAT_RECORD_STEP(STAT_DO_PROCESS_READ, 4); if(!update) { - 
pthread_mutex_lock(&nr_reads_mutex); + //pthread_mutex_lock(&nr_reads_mutex); nr_reads_non_mapped++; - pthread_mutex_unlock(&nr_reads_mutex); + //pthread_mutex_unlock(&nr_reads_mutex); add_to_non_mapped_read(numpair * 4, round, fpe1, fpe2, reads_buffer); } if(hasIndel) @@ -863,8 +837,9 @@ static void do_process_read(process_read_arg_t *arg) nr_reads_total_from_dpus++; pthread_mutex_unlock(&nr_reads_mutex); } + STAT_RECORD_STEP(STAT_DO_PROCESS_READ, 5); } - STAT_RECORD_STEP(STAT_DO_PROCESS_READ, 5); + STAT_RECORD_LAST_STEP(STAT_DO_PROCESS_READ, 6); } #define PROCESS_READ_THREAD (8) @@ -876,8 +851,6 @@ static bool stop_threads = false; void process_read(FILE *fpe1, FILE *fpe2, int round, unsigned int pass_id) { STAT_RECORD_START(STAT_PROCESS_READ); - if (pass_id>10) - return; int8_t *reads_buffer = get_reads_buffer(pass_id); float *reads_quality_buffer = get_reads_quality_buffer(pass_id); acc_results_t acc_res = accumulate_get_result(pass_id); diff --git a/host/src/upvc.c b/host/src/upvc.c index fdf3af2..45ab956 100644 --- a/host/src/upvc.c +++ b/host/src/upvc.c @@ -70,19 +70,27 @@ void exec_dpus() { backends_functions.load_mram(dpu_offset, 0); + STAT_RECORD_STEP(STAT_EXEC_DPUS, 0); + sem_wait(&dispatch_to_exec_sem); + STAT_RECORD_STEP(STAT_EXEC_DPUS, 1); + FOREACH_PASS(each_pass) { backends_functions.run_dpu( dpu_offset, each_pass, &exec_to_dispatch_sem, &acc_to_exec_sem, &exec_to_acc_sem, &dispatch_to_exec_sem); } + STAT_RECORD_STEP(STAT_EXEC_DPUS, 2); + backends_functions.wait_dpu(); + STAT_RECORD_STEP(STAT_EXEC_DPUS, 3); + sem_post(&exec_to_acc_sem); } - STAT_RECORD_LAST_STEP(STAT_EXEC_DPUS, 0); + STAT_RECORD_LAST_STEP(STAT_EXEC_DPUS, 4); } void *thread_dispatch(__attribute__((unused)) void *arg) @@ -343,6 +351,8 @@ int main(int argc, char *argv[]) printf("%s\n", VERSION); print_time(); + for (int i=0; i<15; i++) + profiling[i] = (struct time_stat_t){0}; printf("Information:\n"); printf("\tread size: %d\n", SIZE_READ); From 
c511ed74d74bd06ede3c8e4badb66c82c6b7d249 Mon Sep 17 00:00:00 2001 From: amoisson Date: Tue, 1 Feb 2022 14:07:59 +0100 Subject: [PATCH 28/48] 98.8% common match \o/ --- common/inc/common.h | 2 +- host/inc/accumulateread.h | 3 ++- host/src/accumulateread.c | 8 ++++++-- host/src/getread.c | 4 ++++ host/src/processread.c | 2 +- host/src/upvc.c | 5 +++++ 6 files changed, 19 insertions(+), 5 deletions(-) diff --git a/common/inc/common.h b/common/inc/common.h index e1a1cbd..0e66dc1 100644 --- a/common/inc/common.h +++ b/common/inc/common.h @@ -134,6 +134,6 @@ typedef struct { #define MAPQ_SCALING_FACTOR 2 #define READ_DIST_LOWER_BOUND 50 #define READ_DIST_UPPER_BOUND 2000 -#define depth_filter depth_filter_permissive +#define depth_filter depth_filter3 #endif /* __COMMON_H__ */ diff --git a/host/inc/accumulateread.h b/host/inc/accumulateread.h index 74f5c59..8be55a4 100644 --- a/host/inc/accumulateread.h +++ b/host/inc/accumulateread.h @@ -6,6 +6,7 @@ #define __ACCUMULATEREAD_H__ #include "common.h" +#include typedef struct { nb_result_t nb_res; @@ -13,7 +14,7 @@ typedef struct { } acc_results_t; acc_results_t *accumulate_get_buffer(unsigned int dpu_id, unsigned int pass_id); -acc_results_t accumulate_get_result(unsigned int pass_id); +acc_results_t accumulate_get_result(unsigned int pass_id, bool free_results); void accumulate_read(unsigned int pass_id, unsigned int dpu_offset); diff --git a/host/src/accumulateread.c b/host/src/accumulateread.c index 6b81477..615e1c6 100644 --- a/host/src/accumulateread.c +++ b/host/src/accumulateread.c @@ -158,7 +158,7 @@ static void merge_bucket_and_acc_list( } } -acc_results_t accumulate_get_result(unsigned int pass_id) +acc_results_t accumulate_get_result(unsigned int pass_id, bool free_results) { if (result_file[pass_id] == NULL) { static const dpu_result_out_t dummy_res = { .num = -1 }; @@ -180,6 +180,10 @@ acc_results_t accumulate_get_result(unsigned int pass_id) size_t size_read = fread(results, size, 1, result_file[pass_id]); 
assert(size_read == 1); + if (free_results) { + fclose(result_file[pass_id]); + result_file[pass_id] = NULL; + } return (acc_results_t) { .nb_res = (size / sizeof(dpu_result_out_t)) - 1, .results = results }; } @@ -232,7 +236,7 @@ void accumulate_read(unsigned int pass_id, unsigned int dpu_offset) } // Get data from FILE * - acc_results_t acc_res_from_file = accumulate_get_result(pass_id); + acc_results_t acc_res_from_file = accumulate_get_result(pass_id, false); unsigned int nb_read = acc_res_from_file.nb_res + total_nb_res; size_t size = sizeof(dpu_result_out_t) * (nb_read + 1); diff --git a/host/src/getread.c b/host/src/getread.c index e5c672d..27e3249 100644 --- a/host/src/getread.c +++ b/host/src/getread.c @@ -113,6 +113,10 @@ void get_reads(FILE *fpe1, FILE *fpe2, unsigned int pass_id) } while (nb_read < MAX_READS_BUFFER) { + /* + if (pass_id>=20) // FIXME : remove this condition used for debugging + break; + */ if ((get_seq_fast_AQ(fpe1, &reads_buffer[(nb_read + 0) * SIZE_READ], &reads_buffer[(nb_read + 1) * SIZE_READ], &reads_quality_buffer[nb_read/2 * SIZE_READ]) <= 0) || (get_seq_fast_AQ(fpe2, &reads_buffer[(nb_read + 2) * SIZE_READ], &reads_buffer[(nb_read + 3) * SIZE_READ], diff --git a/host/src/processread.c b/host/src/processread.c index 4b86c48..114ee03 100644 --- a/host/src/processread.c +++ b/host/src/processread.c @@ -853,7 +853,7 @@ void process_read(FILE *fpe1, FILE *fpe2, int round, unsigned int pass_id) STAT_RECORD_START(STAT_PROCESS_READ); int8_t *reads_buffer = get_reads_buffer(pass_id); float *reads_quality_buffer = get_reads_quality_buffer(pass_id); - acc_results_t acc_res = accumulate_get_result(pass_id); + acc_results_t acc_res = accumulate_get_result(pass_id, true); nr_reads_total += get_reads_in_buffer(pass_id) / 4; curr_match = 0; diff --git a/host/src/upvc.c b/host/src/upvc.c index 45ab956..ce29916 100644 --- a/host/src/upvc.c +++ b/host/src/upvc.c @@ -163,13 +163,18 @@ void *thread_acc(__attribute__((unused)) void *arg) void 
*thread_process(__attribute__((unused)) void *arg) { + struct timespec start_time, current_time; STAT_RECORD_START(STAT_THREAD_PROCESS); sem_wait(&acc_to_process_sem); + clock_gettime(CLOCK_MONOTONIC_RAW, &start_time); STAT_RECORD_STEP(STAT_THREAD_PROCESS, 0); FOREACH_PASS(each_pass) { process_read(fope1, fope2, round, each_pass); + clock_gettime(CLOCK_MONOTONIC_RAW, ¤t_time); + int spent = current_time.tv_sec-start_time.tv_sec; + printf("time spent: %02dh%02dm%02ds (%ds)\n", spent/3600, spent/60%60, spent%60, spent); sem_post(&accprocess_to_getreads_sem); sem_wait(&acc_to_process_sem); } From 2668efc8990d79761ebc14fc0c9f2836148fb30c Mon Sep 17 00:00:00 2001 From: amoisson Date: Thu, 3 Feb 2022 10:42:33 +0100 Subject: [PATCH 29/48] fixed max score bug --- dpu/src/odpd_opt.S | 2 +- host/src/processread.c | 2 +- host/src/simu_backend.c | 2 +- host/src/vartree.c | 38 +++++++++++++++++++------------------- 4 files changed, 22 insertions(+), 22 deletions(-) diff --git a/dpu/src/odpd_opt.S b/dpu/src/odpd_opt.S index 3b5a4c2..feb5760 100644 --- a/dpu/src/odpd_opt.S +++ b/dpu/src/odpd_opt.S @@ -6,7 +6,7 @@ #define COST_SUB 10 #define COST_GAPO 11 #define COST_GAPE 1 -#define COST_INIT 99 +#define COST_INIT 999 #define LINE_SIZE ( 6*4 ) #define d0off ( 0*4 ) diff --git a/host/src/processread.c b/host/src/processread.c index 114ee03..9e73244 100644 --- a/host/src/processread.c +++ b/host/src/processread.c @@ -31,7 +31,7 @@ #define CODE_T 2 /* ('T'>>1)&3 54H 0101 0100 */ #define CODE_G 3 /* ('G'>>1)&3 47H 0100 0111 */ -#define PQD_INIT_VAL (99) +#define PQD_INIT_VAL (999) #if SIZE_READ>120 #define MAX_SUBSTITUTION 31 diff --git a/host/src/simu_backend.c b/host/src/simu_backend.c index 9c8d584..b0833c4 100644 --- a/host/src/simu_backend.c +++ b/host/src/simu_backend.c @@ -34,7 +34,7 @@ static unsigned int pass_id_shared; static int min(int a, int b) { return a < b ? 
a : b; } -#define PQD_INIT_VAL (99) +#define PQD_INIT_VAL (999) static void ODPD_compute( int i, int j, int8_t *s1, int8_t *s2, int *Pppj, int Pppjm, int *Qppj, int Qlpj, int *Dppj, int Dppjm, int Dlpj, int Dlpjm) { diff --git a/host/src/vartree.c b/host/src/vartree.c index e35ec80..9227e01 100644 --- a/host/src/vartree.c +++ b/host/src/vartree.c @@ -431,7 +431,7 @@ void create_vcf() //total_score += frequency_table[4][ref_genome->pt_seq[seq_number] + seq_position].score; if (total_score == 0) { uncovered_nucleotides++; - } else if (total_score < 3) { + } else if (total_score < 10) { badly_covered_nucleotides++; } else { well_covered_nucleotides++; @@ -462,26 +462,26 @@ void create_vcf() free_frequency_table(); fclose(vcf_file); - unsigned int total_nucleotides = well_covered_nucleotides+badly_covered_nucleotides+uncovered_nucleotides; - printf("\tuncovered nucleotides: %u (%u.%u%%)\n", - uncovered_nucleotides, - uncovered_nucleotides*100/total_nucleotides, - uncovered_nucleotides*10000/total_nucleotides%100); - printf("\tbadly covered nucleotides (less than 3 reads): %u (%u.%u%%)\n", - badly_covered_nucleotides, - badly_covered_nucleotides*100/total_nucleotides, - badly_covered_nucleotides*10000/total_nucleotides%100); - printf("\twell covered nucleotides (3 reads or more): %u (%u.%u%%)\n", - well_covered_nucleotides, - well_covered_nucleotides*100/total_nucleotides, - well_covered_nucleotides*10000/total_nucleotides%100); - printf("\toverly covered nucleotides (more than 50 reads): %u (%u.%u%%)\n", - overly_covered_nucleotides, - overly_covered_nucleotides*100/total_nucleotides, - overly_covered_nucleotides*10000/total_nucleotides%100); + unsigned long total_nucleotides = overly_covered_nucleotides+well_covered_nucleotides+badly_covered_nucleotides+uncovered_nucleotides; + printf("\tuncovered nucleotides: %lu (%lu.%lu%%)\n", + (long)uncovered_nucleotides, + (long)uncovered_nucleotides*100/total_nucleotides, + 
(long)uncovered_nucleotides*10000/total_nucleotides%100); + printf("\tbadly covered nucleotides (less than 10 reads): %lu (%lu.%lu%%)\n", + (long)badly_covered_nucleotides, + (long)badly_covered_nucleotides*100/total_nucleotides, + (long)badly_covered_nucleotides*10000/total_nucleotides%100); + printf("\twell covered nucleotides (10 reads or more): %lu (%lu.%lu%%)\n", + (long)well_covered_nucleotides, + (long)well_covered_nucleotides*100/total_nucleotides, + (long)well_covered_nucleotides*10000/total_nucleotides%100); + printf("\toverly covered nucleotides (more than 50 reads): %lu (%lu.%lu%%)\n", + (long)overly_covered_nucleotides, + (long)overly_covered_nucleotides*100/total_nucleotides, + (long)overly_covered_nucleotides*10000/total_nucleotides%100); printf("\tmax coverage: %u reads\n", max_coverage); printf("\tmax coverage position: chr%u:%u\n", chromosome_most_coverage, position_most_coverage); - printf("\ttotal coverage: %u (eq %u reads; or %ux coverage)\n", total_coverage, total_coverage/SIZE_READ, total_coverage/total_nucleotides); + printf("\ttotal coverage: %u (eq %lu reads; or %lux coverage)\n", total_coverage, (long)total_coverage/SIZE_READ, (long)total_coverage/total_nucleotides); printf("\tnumber of variants: %d (multiple %d)\n", nb_variant, nb_pos_multiple_var); printf("\ttime: %lf s\n", my_clock() - start_time); } From 56296522be831d56afb3ca46c8e6472c8808c275 Mon Sep 17 00:00:00 2001 From: amoisson Date: Fri, 4 Mar 2022 12:10:46 +0100 Subject: [PATCH 30/48] now using affine filter --- host/src/processread.c | 4 +- host/src/processread.c.old | 1099 ++++++++++++++++++++++++++++++++++++ host/src/upvc.c | 2 +- host/src/vartree.c | 13 +- 4 files changed, 1113 insertions(+), 5 deletions(-) create mode 100644 host/src/processread.c.old diff --git a/host/src/processread.c b/host/src/processread.c index 9e73244..5e2066d 100644 --- a/host/src/processread.c +++ b/host/src/processread.c @@ -21,7 +21,7 @@ #include "parse_args.h" #include "profiling.h" -#define 
DEBUG_READ_MAPPING true +#define DEBUG_READ_MAPPING false #define SIZE_INSERT_MEAN (400) #define SIZE_INSERT_STD (3 * 50) @@ -420,7 +420,9 @@ bool update_frequency_table( } STAT_RECORD_STEP(STAT_UPDATE_FREQUENCY_TABLE, 1); + #if DEBUG_READ_MAPPING write_read_mapping_from_backtrack(ref_genome->seq_name[result_tab[pos].coord.seq_nr], genome_pos, backtrack_end, read, num); + #endif STAT_RECORD_STEP(STAT_UPDATE_FREQUENCY_TABLE, 2); diff --git a/host/src/processread.c.old b/host/src/processread.c.old new file mode 100644 index 0000000..a7303fd --- /dev/null +++ b/host/src/processread.c.old @@ -0,0 +1,1099 @@ +/** + * Copyright 2016-2019 - Dominique Lavenier & UPMEM + */ + +#include +#include +#include +#include +#include +#include + +#include "accumulateread.h" +#include "common.h" +#include "debug.h" +#include "genome.h" +#include "getread.h" +#include "processread.h" +#include "mapping_file.h" +#include "upvc.h" +#include "vartree.h" + +#define DEBUG_READ_MAPPING true + +#define SIZE_INSERT_MEAN (400) +#define SIZE_INSERT_STD (3 * 50) + +#define CODE_A 0 /* ('A'>>1)&3 41H 0100 0001 */ +#define CODE_C 1 /* ('C'>>1)&3 43H 0100 0011 */ +#define CODE_T 2 /* ('T'>>1)&3 54H 0101 0100 */ +#define CODE_G 3 /* ('G'>>1)&3 47H 0100 0111 */ + +#define PQD_INIT_VAL (99) + +#if SIZE_READ>120 +#define MAX_SUBSTITUTION 31 +#else +#define MAX_SUBSTITUTION 20 +#endif + +static bool flag_dbg = false; + +typedef struct { + int type; + int ix; + int jx; +} backtrack_t; + +static int min(int a, int b) { return a < b ? 
a : b; } + +static void DPD_compute( + int s1, int s2, int *Dij, int Dijm, int Dimj, int Dimjm, int *Pij, int Pijm, int *Qij, int Qimj, int *xij) +{ + int min_QP, d; + + *Pij = min(Dijm + COST_GAPO, Pijm + COST_GAPE); + *Qij = min(Dimj + COST_GAPO, Qimj + COST_GAPE); + *xij = 0; + + int x; + if (*Pij < *Qij) { + min_QP = *Pij; + x = 2; + } else { + min_QP = *Qij; + x = 3; + } + d = Dimjm; + if ((s1 & 3) != (s2 & 3)) { + d += COST_SUB; + *xij = 1; + } + if (d < min_QP) { + *Dij = d; + } else { + *Dij = min_QP; + *xij = x; + } +} + +int DPD(int8_t *s1, int8_t *s2, backtrack_t *backtrack, int size_neighbour_in_symbols) +{ + int matrix_size = size_neighbour_in_symbols + 1; + int diagonal = (NB_DIAG / 2) + 1; + int D[matrix_size][matrix_size]; + int P[matrix_size][matrix_size]; + int Q[matrix_size][matrix_size]; + int X[matrix_size][matrix_size]; + int min_score = PQD_INIT_VAL; + int min_score_i_idx = 0; + int min_score_j_idx = 0; + int align_distance = 1; + + for (int i = 0; i < matrix_size; i++) { + for (int j = 0; j < matrix_size; j++) { + D[i][j] = 0; + } + } + for (int i = 0; i <= diagonal; i++) { + P[i][0] = PQD_INIT_VAL; + P[0][i] = PQD_INIT_VAL; + Q[i][0] = PQD_INIT_VAL; + Q[0][i] = PQD_INIT_VAL; + D[i][0] = i * COST_SUB; + D[0][i] = i * COST_SUB; + } + + for (int i = 1; i < diagonal; i++) { + for (int j = 1; j < i + diagonal; j++) { + DPD_compute(s1[i - 1], s2[j - 1], &D[i][j], D[i][j - 1], D[i - 1][j], D[i - 1][j - 1], &P[i][j], P[i][j - 1], + &Q[i][j], Q[i - 1][j], &X[i][j]); + } + Q[i][i + diagonal] = PQD_INIT_VAL; + D[i][i + diagonal] = PQD_INIT_VAL; + } + for (int i = diagonal; i < matrix_size - diagonal; i++) { + P[i][i - diagonal] = PQD_INIT_VAL; + D[i][i - diagonal] = PQD_INIT_VAL; + for (int j = i - diagonal + 1; j < i + diagonal; j++) { + DPD_compute(s1[i - 1], s2[j - 1], &D[i][j], D[i][j - 1], D[i - 1][j], D[i - 1][j - 1], &P[i][j], P[i][j - 1], + &Q[i][j], Q[i - 1][j], &X[i][j]); + } + Q[i][i + diagonal] = PQD_INIT_VAL; + D[i][i + diagonal] = 
PQD_INIT_VAL; + } + + for (int i = matrix_size - diagonal; i < matrix_size; i++) { + P[i][i - diagonal] = PQD_INIT_VAL; + D[i][i - diagonal] = PQD_INIT_VAL; + for (int j = i - diagonal + 1; j < matrix_size; j++) { + DPD_compute(s1[i - 1], s2[j - 1], &D[i][j], D[i][j - 1], D[i - 1][j], D[i - 1][j - 1], &P[i][j], P[i][j - 1], + &Q[i][j], Q[i - 1][j], &X[i][j]); + } + if (D[i][matrix_size - 1] < min_score) { + min_score = D[i][matrix_size - 1]; + min_score_i_idx = i; + min_score_j_idx = matrix_size - 1; + } + } + for (int j = matrix_size - diagonal; j < matrix_size; j++) { + if (D[matrix_size - 1][j] < min_score) { + min_score = D[matrix_size - 1][j]; + min_score_i_idx = matrix_size - 1; + min_score_j_idx = j; + } + } + + { + int i = min_score_i_idx; + int j = min_score_j_idx; + backtrack[0].type = CODE_END; + while ((i > 0) && (j > 0)) { + if(X[i][j] == 0) { + i--; + j--; + } else { + if(X[i][j] == 1) { + i--; + j--; + backtrack[align_distance].type = CODE_SUB; + backtrack[align_distance].ix = i; + backtrack[align_distance].jx = j; + align_distance++; + } else { + if(X[i][j] == 2) { + j--; + backtrack[align_distance].type = CODE_INS; + backtrack[align_distance].ix = i; + backtrack[align_distance].jx = j; + align_distance++; + } + else { + i--; + backtrack[align_distance].type = CODE_DEL; + backtrack[align_distance].ix = i; + backtrack[align_distance].jx = j; + align_distance++; + } + } + } + } + } + + return align_distance; +} + +/* + * encoding of differences between read and sequence of the reference genome. + * coding: substitution CODE_SUB pos x + * deletion CODE_DEL pos x+ + * insertion CODE_INS pos x+ + * end CODE_END + * + * x = A | C | G | T + * x+ = a sequence of at least 1 element (i.e. 
A, C, G ou T) + * pos = integer (8 bits) : give the offset of the variant from the start of the read + * + * example S 12 A D 56 A T G I 87 T C X ==> substitution (A) position 12, deletion (ATG) position 56, insertion (TC) position 87 + * The code is return in "code" as a table of int8_t + */ + +#ifdef USE_INDEL +static int code_alignment(uint8_t *code, int score, int8_t *gen, int8_t *read, unsigned size_neighbour_in_symbols, bool *flag) +{ + int code_idx, computed_score, backtrack_idx; + int size_read = SIZE_READ; + int size_neighbour = size_neighbour_in_symbols; + backtrack_t backtrak[size_read]; + + *flag = false; + + if (score == 0) { + code[0] = CODE_END; + return 1; + } + + /* First, looking for subsititution only */ + code_idx = 0; + computed_score = 0; + for (int i = SIZE_SEED; i < size_neighbour + SIZE_SEED; i++) { + if ((gen[i] & 3) != read[i]) { + computed_score += COST_SUB; + code[code_idx++] = CODE_SUB; + code[code_idx++] = i; + code[code_idx++] = read[i]; + if (computed_score > score) { + break; + } + } + } + code[code_idx++] = CODE_END; + if (computed_score == score) + return code_idx; + + /* Otherwise, re-compute the matrix (only some diagonals) and put in backtrack the path */ + backtrack_idx = DPD(gen, read, backtrak, size_neighbour_in_symbols + SIZE_SEED); + if (backtrack_idx == -1) { + code[0] = CODE_ERR; + return 1; + } + + backtrack_idx--; + code_idx = 0; + while (backtrack_idx > 0) { + if (backtrak[backtrack_idx].type == CODE_SUB) { + code[code_idx++] = CODE_SUB; + code[code_idx++] = backtrak[backtrack_idx].jx - 1; + code[code_idx++] = read[backtrak[backtrack_idx].jx - 1]; + backtrack_idx--; + } else { + if (backtrak[backtrack_idx].type == CODE_DEL) { + int backtrack_jx = backtrak[backtrack_idx].jx; + code[code_idx++] = CODE_DEL; + code[code_idx++] = backtrak[backtrack_idx].ix; + code[code_idx++] = gen[backtrak[backtrack_idx].ix] & 3; + backtrack_idx--; + while ((backtrak[backtrack_idx].type == CODE_DEL) && (backtrack_jx == 
backtrak[backtrack_idx].jx)) { + code[code_idx++] = gen[backtrak[backtrack_idx].ix] & 3; + backtrack_idx--; + } + } else { + int backtrack_ix = backtrak[backtrack_idx].ix; + code[code_idx++] = CODE_INS; + code[code_idx++] = backtrak[backtrack_idx].jx - 1; + code[code_idx++] = read[backtrak[backtrack_idx].jx]; + backtrack_idx--; + while ((backtrak[backtrack_idx].type == CODE_INS) && (backtrack_ix == backtrak[backtrack_idx].ix)) { + code[code_idx++] = read[backtrak[backtrack_idx].jx]; + backtrack_idx--; + } + } + } + } + code[code_idx++] = CODE_END; + return code_idx; +} +#endif + +#if 0 +static void set_variant( + dpu_result_out_t result_match, genome_t *ref_genome, int8_t *reads_buffer, unsigned int size_neighbour_in_symbols) +{ + uint32_t code_result_idx; + uint8_t code_result_tab[256]; + int8_t *read; + char nucleotide[4] = { 'A', 'C', 'T', 'G' }; + uint64_t genome_pos = ref_genome->pt_seq[result_match.coord.seq_nr] + result_match.coord.seed_nr; + int size_read = SIZE_READ; + //LOG_TRACE("set_variant called\n"); + + /* Get the differences betweend the read and the sequence of the reference genome that match */ + read = &reads_buffer[result_match.num * size_read]; + code_alignment(code_result_tab, result_match.score, &ref_genome->data[genome_pos], read, size_neighbour_in_symbols); + if (code_result_tab[0] == CODE_ERR) + return; + + /* Update "mapping_coverage" with the number of reads that match at this position of the genome */ + for (int i = 0; i < size_read; i++) { + ref_genome->mapping_coverage[genome_pos + i] += 1; + } + +#if DEBUG_READ_MAPPING + // TODO: check genome_pos is the expected value + write_read_mapping(genome_pos, code_result_tab); +#endif + + code_result_idx = 0; + while (code_result_tab[code_result_idx] != CODE_END) { + int code_result = code_result_tab[code_result_idx]; + //LOG_DEBUG("code_result=%d\n", code_result); + int64_t pos_variant_read = code_result_tab[code_result_idx + 1]; + int64_t pos_variant_genome = genome_pos + pos_variant_read; 
+ int ref_pos = 0; + int alt_pos = 0; + variant_t *newvar = (variant_t *)malloc(sizeof(variant_t)); + newvar->depth = 1; + newvar->score = result_match.score; + newvar->next = NULL; + if (code_result == CODE_SUB) { + /* SNP = 0,1,2,3 (code A,C,T,G) */ + int snp = code_result_tab[code_result_idx + 2]; + newvar->ref[ref_pos++] = nucleotide[ref_genome->data[pos_variant_genome] & 3]; + newvar->alt[alt_pos++] = nucleotide[snp & 3]; + + code_result_idx += 3; + } else if (code_result == CODE_INS) { + int64_t ps_var_genome = pos_variant_genome; + int64_t ps_var_read = pos_variant_read; + code_result_idx += 2; + + while (code_result_tab[code_result_idx] < 4) { + ps_var_read++; + code_result_idx++; + } + + while (ref_genome->data[ps_var_genome] == read[ps_var_read]) { + ps_var_genome--; + ps_var_read--; + pos_variant_genome--; + pos_variant_read--; + } + + newvar->ref[ref_pos++] = nucleotide[ref_genome->data[pos_variant_genome] & 3]; + + while (pos_variant_read <= ps_var_read) { + newvar->alt[alt_pos++] = nucleotide[read[pos_variant_read] & 3]; + if (alt_pos >= MAX_SIZE_ALLELE - 1) { + free(newvar); + return; + } + pos_variant_read++; + } + + } else if (code_result == CODE_DEL) { + int64_t ps_var_genome = pos_variant_genome; + int64_t ps_var_read = pos_variant_read; + code_result_idx += 2; + + while (code_result_tab[code_result_idx] < 4) { + ps_var_genome++; + code_result_idx++; + } + + while (ref_genome->data[ps_var_genome] == read[ps_var_read]) { + ps_var_read--; + ps_var_genome--; + pos_variant_genome--; + pos_variant_read--; + } + + newvar->alt[alt_pos++] = nucleotide[ref_genome->data[pos_variant_genome] & 3]; + + while (pos_variant_genome <= ps_var_genome) { + newvar->ref[ref_pos++] = nucleotide[ref_genome->data[pos_variant_genome] & 3]; + if (ref_pos >= MAX_SIZE_ALLELE - 1) { + free(newvar); + //LOG_TRACE("set_variant early return\n"); + return; + } + pos_variant_genome++; + } + pos_variant_genome -= ref_pos; + } + newvar->ref[ref_pos] = '\0'; + newvar->alt[alt_pos] = 
'\0'; + variant_tree_insert( + newvar, result_match.coord.seq_nr, pos_variant_genome + 1 - ref_genome->pt_seq[result_match.coord.seq_nr]); + } + //LOG_TRACE("set_variant return\n"); +} +#endif + +static pthread_mutex_t non_mapped_mutex; +static void add_to_non_mapped_read(int numread, int round, FILE *fpe1, FILE *fpe2, int8_t *reads_buffer) +{ + if (fpe1 == NULL || fpe2 == NULL) + return; + pthread_mutex_lock(&non_mapped_mutex); + char nucleotide[4] = { 'A', 'C', 'T', 'G' }; + int size_read = SIZE_READ; + int8_t *read = &reads_buffer[numread * size_read]; + fprintf(fpe1, ">>%d\n", SIZE_SEED * (round + 1)); + for (int j = SIZE_SEED; j < size_read; j++) { + fprintf(fpe1, "%c", nucleotide[read[j] & 3]); + } + for (int j = 0; j < SIZE_SEED; j++) { + fprintf(fpe1, "A"); + } + fprintf(fpe1, "\n"); + read = &reads_buffer[(numread + 2) * size_read]; + fprintf(fpe2, ">>%d\n", SIZE_SEED * (round + 1)); + for (int j = SIZE_SEED; j < size_read; j++) { + fprintf(fpe2, "%c", nucleotide[read[j] & 3]); + } + for (int j = 0; j < SIZE_SEED; j++) { + fprintf(fpe2, "A"); + } + fprintf(fpe2, "\n"); + pthread_mutex_unlock(&non_mapped_mutex); +} + +#ifdef USE_INDEL +int get_read_update_positions( + uint64_t * update_genome_position, + dpu_result_out_t *result_tab, + int pos, + genome_t *ref_genome, + uint64_t genome_pos, + int8_t *read, + __attribute__((unused))int size_neighbour_in_symbols, + bool * flag, + bool debug, + uint32_t * substCnt, + char * chromosome_name) { + + // run smith and waterman algorithm to find indels + uint8_t code_result_tab[256]; + code_alignment(code_result_tab, result_tab[pos].score, &ref_genome->data[genome_pos], read, size_neighbour_in_symbols, flag); + write_read_mapping(chromosome_name, result_tab[pos].coord.seed_nr, code_result_tab, (uint8_t*) read); + for(int read_pos = 0; read_pos < SIZE_READ; ++read_pos) { + update_genome_position[read_pos] = 0; + } + if (code_result_tab[0] != CODE_ERR) { + + // array that contains for each read position, the genome 
position that it matches with + // This is the genome position that will be updated in the frequency table + // This genome position takes into account the shift due to possible indels found + // with smith-waterman algorithm + int code_result_index = 0; + int ref_pos = 0; + int nbIndels = 0; + bool ins = false; + while (code_result_tab[code_result_index] != CODE_END) { + int code_result = code_result_tab[code_result_index]; + int64_t pos_variant_read = code_result_tab[code_result_index + 1]; + /*printf("pos variant: %lu\n", pos_variant_read);*/ + int64_t pos_variant_genome = genome_pos + pos_variant_read; + if (code_result == CODE_SUB) { + // do nothing for substitution + code_result_index += 3; + (*substCnt)++; + ref_pos++; + } + else if (code_result == CODE_INS) { + ins = true; + int64_t ps_var_genome = pos_variant_genome; + int64_t ps_var_read = pos_variant_read; + code_result_index += 2; + + while (code_result_tab[code_result_index] < 4) { + ps_var_read++; + code_result_index++; + } + + while (ref_genome->data[ps_var_genome] == read[ps_var_read] && ps_var_genome + && pos_variant_read) { + assert(ps_var_genome && ps_var_read && pos_variant_genome && pos_variant_read); + ps_var_genome--; + ps_var_read--; + pos_variant_genome--; + pos_variant_read--; + } + + /*newvar->ref[ref_pos++] = nucleotide[ref_genome->data[pos_variant_genome] & 3];*/ + ref_pos++; + + // skip first value which should be the equivalent of first element in ref genome + pos_variant_read++; + while (pos_variant_read <= ps_var_read) { + // position should not be updated yet + if(update_genome_position[pos_variant_read] != 0) { + LOG_WARN("duplicate update (Insertion) at position %lu. 
Current %lu\n", + pos_variant_read, update_genome_position[pos_variant_read]); + fflush(stdout); + return -1; + } + update_genome_position[pos_variant_read++] = UINT64_MAX; + } + ++nbIndels; + } + else if (code_result == CODE_DEL) { + + int64_t ps_var_genome = pos_variant_genome; + int64_t ps_var_read = pos_variant_read; + code_result_index += 2; + + while (code_result_tab[code_result_index] < 4) { + ps_var_genome++; + code_result_index++; + } + + while (ref_genome->data[ps_var_genome] == read[ps_var_read] && pos_variant_genome && ps_var_read) { + assert(ps_var_genome && ps_var_read && pos_variant_genome && pos_variant_read); + ps_var_read--; + ps_var_genome--; + pos_variant_genome--; + pos_variant_read--; + } + + // on a deletion store the threshold to apply from the current read position + assert(ps_var_genome > pos_variant_genome); + if(pos_variant_read + 1 < SIZE_READ) { + // position should not be updated yet + if(update_genome_position[pos_variant_read+1] != 0) { + LOG_WARN("duplicate update (Deletion) at position %lu. 
Current %lu\n", + pos_variant_read+1, update_genome_position[pos_variant_read+1]); + fflush(stdout); + return -1; + } + /*assert(update_genome_position[pos_variant_read+1] == 0);*/ + update_genome_position[pos_variant_read + 1] = ps_var_genome - pos_variant_genome; + } + while (pos_variant_genome <= ps_var_genome) { + pos_variant_genome++; + ref_pos++; + } + pos_variant_genome -= ref_pos; + ++nbIndels; + } + else + assert(0); + } + + // debug prints + if(nbIndels && debug) + printf("SW algorithm (nbIndels %d) ins %d:\n", nbIndels, ins); + int64_t curr_pos = genome_pos; + for(int read_pos = 0; read_pos < SIZE_READ; ++read_pos) { + switch(update_genome_position[read_pos]) { + case 0: + update_genome_position[read_pos] = curr_pos++; + if(nbIndels && debug) + printf(" "); + break; + case UINT64_MAX: + if(nbIndels && debug) + printf("I"); + break; + default: + if(nbIndels && debug) { + for(uint64_t print_index = 0; print_index < update_genome_position[read_pos]; ++print_index) + printf("D"); + } + curr_pos += update_genome_position[read_pos]; + update_genome_position[read_pos] = curr_pos++; + } + } + + if(nbIndels && debug) { + printf("\n"); + fflush(stdout); + } + return nbIndels; + } + else + assert(0); + + return false; +} +#endif + + +static pthread_mutex_t freq_table_mutex; + +/** + * function to update frequency table used for variant calling + **/ +bool update_frequency_table( + genome_t *ref_genome, + dpu_result_out_t *result_tab, + int8_t *reads_buffer, + float *reads_quality_buffer, + int pos, + float mapq, + __attribute__((unused))int size_neighbour_in_symbols) { + + struct frequency_info **frequency_table = get_frequency_table(); + uint64_t genome_pos = ref_genome->pt_seq[result_tab[pos].coord.seq_nr] + result_tab[pos].coord.seed_nr; + int num = result_tab[pos].num; + int8_t *read = reads_buffer + (num * SIZE_READ); + float *read_quality = reads_quality_buffer + (num/2 * SIZE_READ); + //TODO: read the quality in the correct order (inverted or not) + bool inv 
= num & 1; + //TODO: assume no offset here + +#ifdef USE_INDEL + + static bool debug = false; + static char nucleotide[4] = { 'A', 'C', 'T', 'G' }; + uint64_t update_genome_position[SIZE_READ]; + uint32_t substCnt = 0; + flag_dbg = false; + + // for simplicity put all this in a critical section protected by a mutex + // since the frequency table is shared (but inefficient) + + pthread_mutex_lock(&freq_table_mutex); + int nbIndels = get_read_update_positions(update_genome_position, result_tab, pos, + ref_genome, genome_pos, read, size_neighbour_in_symbols, &flag_dbg, debug, &substCnt, ref_genome->seq_name[result_tab[pos].coord.seq_nr]); + bool hasIndel = nbIndels > 0; + + // debug prints + if(hasIndel && debug) { + + printf("Read:\n"); + for(int k = 0; k < SIZE_READ; ++k) { + printf("%c", nucleotide[read[k]]); + } + printf("\ngenome (pos %u:%u):\n", result_tab[pos].coord.seed_nr, result_tab[pos].coord.seed_nr+SIZE_READ); + for(uint64_t k = genome_pos; k < genome_pos + SIZE_READ; ++k) { + printf("%c", nucleotide[ref_genome->data[k]]); + } + printf("\nupdate pos:\n"); + uint64_t lastpos = 0; + for(uint64_t k = 0; k < SIZE_READ; ++k) { + if(k && update_genome_position[k] != lastpos+1) { + if(update_genome_position[k] == UINT64_MAX) + printf("X"); + /*printf("No update at position %lu\n", k);*/ + else if(lastpos == UINT64_MAX) + /*printf("New start at pos %lu = %lu / %c\n", k, update_genome_position[k], nucleotide[ref_genome->data[update_genome_position[k]]]);*/ + printf("%c", nucleotide[ref_genome->data[update_genome_position[k]]]); + else + /*printf("Change at pos %lu, diff %ld, %c\n", k, update_genome_position[k] - lastpos, nucleotide[ref_genome->data[update_genome_position[k]]]);*/ + printf("%c", nucleotide[ref_genome->data[update_genome_position[k]]]); + } + /*else if(nucleotide[ref_genome->data[update_genome_position[k]]] != nucleotide[read[k]]) {*/ + /*printf(RED "%c" RESET, nucleotide[ref_genome->data[update_genome_position[k]]]);*/ + /*}*/ + else + printf("%c", 
nucleotide[ref_genome->data[update_genome_position[k]]]); + lastpos = update_genome_position[k]; + } + + printf("\nsubst:\n"); + for(uint64_t k = 0; k < SIZE_READ; ++k) { + if(update_genome_position[k] == UINT64_MAX) { + printf(" "); + continue; + } + else if(nucleotide[ref_genome->data[update_genome_position[k]]] != nucleotide[read[k]]) { + printf("U"); + substCnt++; + } + else + printf(" "); + } + printf("\n\n"); + fflush(stdout); + assert(!result_tab[pos].coord.nodp); + } + else if(debug) { + if(!result_tab[pos].coord.nodp) { + LOG_WARN("odpd result with no indels detected (flag = %d, subst cnt %u)):\n", flag_dbg, substCnt); + printf("Read:\n"); + for(int k = 0; k < SIZE_READ; ++k) { + printf("%c", nucleotide[read[k]]); + } + printf("\ngenome (pos %u:%u):\n", result_tab[pos].coord.seed_nr, result_tab[pos].coord.seed_nr+SIZE_READ); + for(uint64_t k = genome_pos; k < genome_pos + SIZE_READ; ++k) { + printf("%c", nucleotide[ref_genome->data[k]]); + } + printf("\n\n"); + fflush(stdout); + } + } + + /*pthread_mutex_lock(&freq_table_mutex);*/ + // for the moment support only one indel otherwise we have some issue FIXME + if(substCnt <= MAX_SUBSTITUTION && (hasIndel || result_tab[pos].coord.nodp) && nbIndels >= 0) { + for(uint64_t k = 0; k < SIZE_READ; ++k) { + uint64_t update_genome_pos = update_genome_position[k]; + if(update_genome_pos < genome_get()->fasta_file_size) { + frequency_table[read[k]][update_genome_pos].freq += mapq * read_quality[inv ? 
SIZE_READ - k - 1 : k]; + /*frequency_table[read[j]][genome_pos+j].score += result_tab[pos].score;*/ + frequency_table[read[k]][update_genome_pos].score++; + } + else if (update_genome_pos != UINT64_MAX) + LOG_WARN("genome update position computed is wrong %lu\n", update_genome_pos); + } + } + /*fflush(stdout);*/ + pthread_mutex_unlock(&freq_table_mutex); + return hasIndel; + +#else + pthread_mutex_lock(&freq_table_mutex); + for(int j = 0; j < SIZE_READ; ++j) { + if(genome_pos + j < genome_get()->fasta_file_size) { + frequency_table[read[j]][genome_pos+j].freq += mapq * read_quality[inv ? SIZE_READ - j - 1 : j]; + frequency_table[read[j]][genome_pos+j].score++; + } + else + LOG_WARN("reads matched at position that exceeds genome size\n"); + } + pthread_mutex_unlock(&freq_table_mutex); + return false; +#endif +} + +static volatile unsigned int curr_match; +static pthread_mutex_t curr_match_mutex; +unsigned int acquire_curr_match() +{ + pthread_mutex_lock(&curr_match_mutex); + return curr_match; +} +void release_curr_match(unsigned int new_curr_match) +{ + curr_match = new_curr_match; + pthread_mutex_unlock(&curr_match_mutex); + return; +} + +static pthread_barrier_t barrier; + +typedef struct { + unsigned int nb_match; + dpu_result_out_t *result_tab; + int round; + int8_t *reads_buffer; + float *reads_quality_buffer; + genome_t *ref_genome; + FILE *fpe1; + FILE *fpe2; +} process_read_arg_t; + +static uint64_t nr_reads_total = 0ULL; +static uint64_t nr_reads_total_from_dpus = 0ULL; +static uint64_t nr_reads_non_mapped = 0ULL; +static uint64_t nr_reads_with_indels = 0ULL; +static pthread_mutex_t nr_reads_mutex = PTHREAD_MUTEX_INITIALIZER; + +#define MISMATCH_COUNT(X) (X.score / 10) +#define INVALID_SCORE 1000 + +static void keep_best_2_scores(unsigned score, unsigned* P1, unsigned *P2, unsigned x1, unsigned x2, unsigned* best_score) { + + if(score < best_score[0]) { + + // move current to next position + best_score[1] = best_score[0]; + P1[1] = P1[0]; + P2[1] = P2[0]; 
+ // update first position + best_score[0] = score; + P1[0] = x1; + P2[0] = x2; + } + else if (score < best_score[1]) { + + // update second position + best_score[1] = score; + P1[1] = x1; + P2[1] = x2; + } +} + +static unsigned get_nb_scores(unsigned int * best_score) { + + unsigned np = 0; + if(best_score[0] < INVALID_SCORE) { + np++; + if(best_score[1] < INVALID_SCORE) np++; + } + return np; +} + +/*#define USE_MAPQ_SCORE*/ +static void do_process_read(process_read_arg_t *arg) +{ + const unsigned int nb_match = arg->nb_match; + dpu_result_out_t *result_tab = arg->result_tab; + int round = arg->round; + int8_t *reads_buffer = arg->reads_buffer; + float *reads_quality_buffer = arg->reads_quality_buffer; + genome_t *ref_genome = arg->ref_genome; + FILE *fpe1 = arg->fpe1; + FILE *fpe2 = arg->fpe2; + unsigned int size_neighbour_in_symbols = (SIZE_NEIGHBOUR_IN_BYTES - DELTA_NEIGHBOUR(round)) * 4; + /*printf("size_neighbour_in_symbols : %u", size_neighbour_in_symbols);*/ + + /* + * The number of a pair is given by "num_read / 4 " (see dispatch_read function) + * Their type is given by their offset (see dispatch_read function) + * type = num_read%4 == 0 ==> read 1 + * 1 ==> read 1 complement + * 2 ==> read 2 + * 3 ==> read 2 complement + * The read pair to consider are [0, 3] and [1, 2]. 
+ * + * NEW (04/2020): + * - more paired reads are considered + * - when different position mapping are possible, choose the less covered zone + */ + + while (true) { + unsigned int i; + if ((i = acquire_curr_match()) >= nb_match) { + release_curr_match(i); + return; + } + int numpair = result_tab[i].num / 4; + unsigned int j = i; + while ((j < nb_match) && (numpair == result_tab[j].num / 4)) { + j++; + } + release_curr_match(j); + + // i = start index in result_tab + // j = stop index in result_tab + // select best couples of paired reads + unsigned int P1[2]; + unsigned int P2[2]; + unsigned int pos1, pos2, t1, t2; + unsigned int best_score[2] = { 1000, 1000 }; + /*unsigned int best_score_all = 1000;*/ + // test all significant pairs of reads (0,3) & (1,2) + for (unsigned int x1 = i; x1 < j; x1++) { + t1 = result_tab[x1].num % 4; + pos1 = result_tab[x1].coord.seed_nr; + for (unsigned int x2 = i + 1; x2 < j; x2++) { + pos2 = result_tab[x2].coord.seed_nr; + t2 = result_tab[x2].num % 4; + if (t1 + t2 == 3) // select significant pair + { + if ((abs((int)pos2 - (int)pos1) > READ_DIST_LOWER_BOUND && (abs((int)pos2 - (int)pos1) < READ_DIST_UPPER_BOUND))) { + // update if this is one of the two best scores + keep_best_2_scores(result_tab[x1].score + result_tab[x2].score, P1, P2, x1, x2, best_score); + } + } + } + } + + bool update = false; + bool hasIndel = false; + + unsigned np = get_nb_scores(best_score); + if (np > 0) { + + if(np == 2) { + + // found at least 2 matching pairs of positions. 
Check the delta between the two pairs to + // decide whether we should keep the best pair + int delta = abs((int)(MISMATCH_COUNT(result_tab[P1[0]]) + MISMATCH_COUNT(result_tab[P2[0]])) + - (int)(MISMATCH_COUNT(result_tab[P1[1]]) + MISMATCH_COUNT(result_tab[P2[1]]))); + + float mapq = 1.0f; +#ifdef USE_MAPQ_SCORE + int delta_corrected = MISMATCH_COUNT(result_tab[P1[0]]) + + MISMATCH_COUNT(result_tab[P2[0]]) + MAPQ_SCALING_FACTOR * ((2 * (MAX_SUBSTITUTION + 1)) - delta); + if(delta_corrected < 0) { + LOG_WARN("negative delta for square root %d\n", delta_corrected); + } + else if(delta > DIST_PAIR_THRESHOLD) { + mapq = 1.0 - sqrt((double)delta_corrected / SIZE_READ); +#else + if(delta > DIST_PAIR_THRESHOLD) { +#endif + hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P1[0], mapq, size_neighbour_in_symbols); + hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P2[0], mapq, size_neighbour_in_symbols); + update = true; + } + } + else if(np) { // only one result, take it + int delta = abs((int)(MISMATCH_COUNT(result_tab[P1[0]]) + MISMATCH_COUNT(result_tab[P2[0]])) - (2 * (MAX_SUBSTITUTION + 1))); + float mapq = 1.0f; +#ifdef USE_MAPQ_SCORE + int delta_corrected = MISMATCH_COUNT(result_tab[P1[0]]) + MISMATCH_COUNT(result_tab[P2[0]]) + MAPQ_SCALING_FACTOR * ((2 * (MAX_SUBSTITUTION + 1)) - delta); + if(delta_corrected < 0) { + LOG_WARN("negative delta (np == 1) for square root %d\n", delta_corrected); + } + else if(delta > DIST_PAIR_THRESHOLD) { + mapq = 1.0 - sqrt((double)delta_corrected / SIZE_READ); +#else + if(delta > DIST_PAIR_THRESHOLD) { +#endif + hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P1[0], mapq, size_neighbour_in_symbols); + hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P2[0], mapq, size_neighbour_in_symbols); + update = true; + } + } + } + if(true) { + + // check mapping 
of R1 and R2 independently + unsigned int best_score_R1[2] = { 1000, 1000 }; + unsigned int best_score_R2[2] = { 1000, 1000 }; + P1[0] = 0; + P2[0] = 0; + P1[1] = 0; + P2[1] = 0; + for (unsigned int read = i; read < j; read++) { + unsigned t1 = result_tab[read].num % 4; + if(t1 < 2) { // PE1 or RPE1 + keep_best_2_scores(result_tab[read].score, P1, P2, read, 0, best_score_R1); + } + else { // PE2 or RPE2 + keep_best_2_scores(result_tab[read].score, P1, P2, 0, read, best_score_R2); + } + } + + unsigned np1 = get_nb_scores(best_score_R1), np2 = get_nb_scores(best_score_R2); + if(np1 == 2) { + + int delta = abs((int)MISMATCH_COUNT(result_tab[P1[0]]) - (int)MISMATCH_COUNT(result_tab[P1[1]])); + + float mapq = 1.0f; +#ifdef USE_MAPQ_SCORE + int delta_corrected = MISMATCH_COUNT(result_tab[P1[0]]) + MAPQ_SCALING_FACTOR * ((MAX_SUBSTITUTION + 1) - delta); + + if(delta_corrected < 0) { + LOG_WARN("negative delta (np1 == 2) for square root %d\n", delta_corrected); + } + else if(delta > DIST_SINGLE_THRESHOLD) { + mapq = 1.0 - sqrt((double)delta_corrected / SIZE_READ); +#else + if(delta > DIST_SINGLE_THRESHOLD) { +#endif + hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P1[0], mapq, size_neighbour_in_symbols); + update = true; + } + } + else if(np1) { + int delta = abs((int)MISMATCH_COUNT(result_tab[P1[0]]) - (MAX_SUBSTITUTION + 1)); + + float mapq = 1.0f; +#ifdef USE_MAPQ_SCORE + int delta_corrected = MISMATCH_COUNT(result_tab[P1[0]]) + MAPQ_SCALING_FACTOR * ((MAX_SUBSTITUTION + 1) - delta); + + if(delta_corrected < 0) { + LOG_WARN("negative delta (np1 == 1) for square root %d\n", delta_corrected); + } + else if(delta > DIST_SINGLE_THRESHOLD) { + mapq = 1.0 - sqrt((double)delta_corrected / SIZE_READ); +#else + if(delta > DIST_SINGLE_THRESHOLD) { +#endif + hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P1[0], mapq, size_neighbour_in_symbols); + update = true; + } + } + + if(np2 == 2) 
{ + + int delta = abs((int)MISMATCH_COUNT(result_tab[P2[0]]) - (int)MISMATCH_COUNT(result_tab[P2[1]])); + + float mapq = 1.0f; +#ifdef USE_MAPQ_SCORE + int delta_corrected = MISMATCH_COUNT(result_tab[P2[0]]) + MAPQ_SCALING_FACTOR * ((MAX_SUBSTITUTION + 1) - delta); + + if(delta_corrected < 0) { + LOG_WARN("negative delta (np2 == 2) for square root %d, %d %d %d %d\n", + delta_corrected, MISMATCH_COUNT(result_tab[P2[0]]), MISMATCH_COUNT(result_tab[P2[1]]), MAX_SUBSTITUTION + 1, delta); + } + else if(delta > DIST_SINGLE_THRESHOLD) { + mapq = 1.0 - sqrt((double)delta_corrected / SIZE_READ); +#else + if(delta > DIST_SINGLE_THRESHOLD) { +#endif + + hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P2[0], mapq, size_neighbour_in_symbols); + update = true; + } + } + else if(np2) { + int delta = abs((int)MISMATCH_COUNT(result_tab[P2[0]]) - (MAX_SUBSTITUTION + 1)); + + float mapq = 1.0f; +#ifdef USE_MAPQ_SCORE + int delta_corrected = MISMATCH_COUNT(result_tab[P2[0]]) + MAPQ_SCALING_FACTOR * ((MAX_SUBSTITUTION + 1) - delta); + + if(delta_corrected < 0) { + LOG_WARN("negative delta (np2 == 1) for square root %d\n", delta_corrected); + } + else if (delta > DIST_SINGLE_THRESHOLD) { + mapq = 1.0 - sqrt((double)delta_corrected / SIZE_READ); +#else + if (delta > DIST_SINGLE_THRESHOLD) { +#endif + hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P2[0], mapq, size_neighbour_in_symbols); + update = true; + } + } + if(!update) { + pthread_mutex_lock(&nr_reads_mutex); + nr_reads_non_mapped++; + pthread_mutex_unlock(&nr_reads_mutex); + add_to_non_mapped_read(numpair * 4, round, fpe1, fpe2, reads_buffer); + } + if(hasIndel) + nr_reads_with_indels++; + pthread_mutex_lock(&nr_reads_mutex); + nr_reads_total_from_dpus++; + pthread_mutex_unlock(&nr_reads_mutex); + } + } +} + +#define PROCESS_READ_THREAD (8) +#define PROCESS_READ_THREAD_SLAVE (PROCESS_READ_THREAD - 1) +static process_read_arg_t args; 
+static pthread_t thread_id[PROCESS_READ_THREAD_SLAVE]; +static bool stop_threads = false; + +void process_read(FILE *fpe1, FILE *fpe2, int round, unsigned int pass_id) +{ + int8_t *reads_buffer = get_reads_buffer(pass_id); + float *reads_quality_buffer = get_reads_quality_buffer(pass_id); + acc_results_t acc_res = accumulate_get_result(pass_id); + nr_reads_total += get_reads_in_buffer(pass_id) / 4; + + curr_match = 0; + + args.nb_match = acc_res.nb_res; + args.result_tab = acc_res.results; + args.round = round; + args.reads_buffer = reads_buffer; + args.reads_quality_buffer = reads_quality_buffer; + args.fpe1 = fpe1; + args.fpe2 = fpe2; + + pthread_barrier_wait(&barrier); + do_process_read(&args); + pthread_barrier_wait(&barrier); + + free(acc_res.results); +} + +static void *process_read_thread_fct(void *arg) +{ + pthread_barrier_wait(&barrier); + while (!stop_threads) { + do_process_read(arg); + pthread_barrier_wait(&barrier); + pthread_barrier_wait(&barrier); + } + return NULL; +} + +void process_read_init() +{ +#if DEBUG_READ_MAPPING + open_mapping_file(); +#endif + genome_t *ref_genome = genome_get(); + args.ref_genome = ref_genome; + + assert(pthread_mutex_init(&curr_match_mutex, NULL) == 0); + assert(pthread_mutex_init(&non_mapped_mutex, NULL) == 0); + assert(pthread_mutex_init(&freq_table_mutex, NULL) == 0); + assert(pthread_barrier_init(&barrier, NULL, PROCESS_READ_THREAD) == 0); + + for (unsigned int each_thread = 0; each_thread < PROCESS_READ_THREAD_SLAVE; each_thread++) { + assert(pthread_create(&thread_id[each_thread], NULL, process_read_thread_fct, &args) == 0); + } +} + +void process_read_free() +{ +#if DEBUG_READ_MAPPING + close_mapping_file(); +#endif + stop_threads = true; + pthread_barrier_wait(&barrier); + + for (unsigned int each_thread = 0; each_thread < PROCESS_READ_THREAD_SLAVE; each_thread++) { + assert(pthread_join(thread_id[each_thread], NULL) == 0); + } + + assert(pthread_barrier_destroy(&barrier) == 0); + 
assert(pthread_mutex_destroy(&curr_match_mutex) == 0); + assert(pthread_mutex_destroy(&non_mapped_mutex) == 0); + assert(pthread_mutex_destroy(&freq_table_mutex) == 0); + fflush(stdout); + fprintf(stderr, "%% reads non mapped: %f%%\n", (float)nr_reads_non_mapped * 100.0 / (float)nr_reads_total_from_dpus); + fprintf(stderr, "%% reads with indels: %f%%\n", (float)nr_reads_with_indels * 100.0 / (float)(nr_reads_total_from_dpus - nr_reads_non_mapped)); + fprintf(stderr, "%% Total reads from dpus: %ld%%\n", nr_reads_total_from_dpus); + fprintf(stderr, "%% Total reads: %ld%%\n", nr_reads_total); +} diff --git a/host/src/upvc.c b/host/src/upvc.c index ce29916..1defa0f 100644 --- a/host/src/upvc.c +++ b/host/src/upvc.c @@ -174,7 +174,7 @@ void *thread_process(__attribute__((unused)) void *arg) process_read(fope1, fope2, round, each_pass); clock_gettime(CLOCK_MONOTONIC_RAW, &current_time); int spent = current_time.tv_sec-start_time.tv_sec; - printf("time spent: %02dh%02dm%02ds (%ds)\n", spent/3600, spent/60%60, spent%60, spent); + printf("time spent: %02dh%02dm%02ds (%ds), id:%d\n", spent/3600, spent/60%60, spent%60, spent, each_pass); sem_post(&accprocess_to_getreads_sem); sem_wait(&acc_to_process_sem); } diff --git a/host/src/vartree.c b/host/src/vartree.c index 9227e01..c9227a0 100644 --- a/host/src/vartree.c +++ b/host/src/vartree.c @@ -284,6 +284,13 @@ __attribute__((unused)) uint32_t depth_filter_fixed_3_f15(float freq) { return 3; } +#define AFFINE_B 1.12795388963028 +#define AFFINE_A 0.090970032203921 + +float reverse_filter(uint32_t score) { + return AFFINE_A*(float)score + AFFINE_B; +} + FILE * dbg_file = NULL; FILE * sub_file = NULL; @@ -296,19 +303,19 @@ static variant_t ** get_most_frequent_variant(genome_t * ref_genome, struct freq variant_t** results = calloc(5, sizeof(variant_t*)); float total = 0; for(int i = 0; i < 5; ++i) { - total += frequency_table[i][genome_pos].freq; + total += frequency_table[i][genome_pos].score; } if(total == 0) return results; for(int i
= 0; i < 5; ++i) { - float freq = frequency_table[i][genome_pos].freq; + //float freq = frequency_table[i][genome_pos].freq; uint32_t score = frequency_table[i][genome_pos].score; if(i == ref_genome->data[genome_pos]) continue; // not a variant if the same nucleotide as in reference genome // if frequency and depth pass the threshold, consider it a variant - if(score >= depth_filter(freq * 100.0 / total)) { + if(score > reverse_filter(total)) { // this is a substitution, create variant variant_t *var = (variant_t *)malloc(sizeof(variant_t)); From 0899449b7b9d022a4f7cfcec856679ea0c8315e9 Mon Sep 17 00:00:00 2001 From: amoisson Date: Fri, 11 Mar 2022 14:38:28 +0100 Subject: [PATCH 31/48] Added a way to dump the frequency table in a bin file and extract parts of it with a python script --- host/src/vartree.c | 115 +++++++++++++++++++++++++++++++++------- tests/read_freq_dump.py | 58 ++++++++++++++++++++ 2 files changed, 154 insertions(+), 19 deletions(-) create mode 100755 tests/read_freq_dump.py diff --git a/host/src/vartree.c b/host/src/vartree.c index c9227a0..2f679e6 100644 --- a/host/src/vartree.c +++ b/host/src/vartree.c @@ -1,4 +1,4 @@ -/** + /** * Copyright 2016-2019 - Dominique Lavenier & UPMEM */ @@ -14,6 +14,79 @@ #include "parse_args.h" #include "upvc.h" #include "vartree.h" +#include "debug.h" + + +#define DUMP_FREQUENCY_TABLE false +#define FREQ_TABLE_DUMP_FILE_NAME "frequency_table.bin" +#if DUMP_FREQUENCY_TABLE +FILE *freq_table_dump_file; + +void open_freq_table() +{ + static char filename[FILENAME_MAX]; + sprintf(filename, "%s", FREQ_TABLE_DUMP_FILE_NAME); + freq_table_dump_file = fopen(filename, "w"); + if (freq_table_dump_file == NULL) { + LOG_FATAL("couldn't open frequency table dumping file; errno: %u\n", errno); + } +} + +void close_freq_table() +{ + fclose(freq_table_dump_file); +} + +//The pragma pack shouldn't have any effect here as of now but it might have an effect if struct content is modified +//In which case it should probably be kept. 
+#pragma pack(push,1) +struct freq_table_dump_entry_t{ + float freqs[5]; + uint16_t scores[5]; +}; +#pragma pack(pop) + +// freq_table_dump format: +// 0: uint8_t number of sequences = {n} +// 1 - n*4: uint16_t[n] sequence start addresses +// n*4+1 - end: struct freq_table_dump_entry_t[...] table entries +void write_freq_table_header(genome_t *ref_genome) +{ + uint8_t number_sequences = ref_genome->nb_seq; + uint64_t sequence_offsets[256]; + uint64_t header_size = 1 + sizeof(uint64_t)* (uint64_t) number_sequences; + uint64_t total_seq_length=0; + for(uint8_t seq_number=0; seq_number<ref_genome->nb_seq; seq_number++) { + sequence_offsets[seq_number] = header_size + total_seq_length*sizeof(struct freq_table_dump_entry_t); + total_seq_length += ref_genome->len_seq[seq_number]; + } + fwrite(&number_sequences, 1, 1, freq_table_dump_file); + fwrite(sequence_offsets, sizeof(uint64_t), number_sequences, freq_table_dump_file); +} + +void dump_freq_table_entry(int address, struct frequency_info **frequency_table) +{ + struct freq_table_dump_entry_t entry; + for (int i=0; i<5; i++) { + entry.freqs[i] = frequency_table[i][address].freq; + entry.scores[i] = frequency_table[i][address].score; + } + fwrite(&entry, sizeof(struct freq_table_dump_entry_t), 1, freq_table_dump_file); +} + +void dump_freq_table(genome_t *ref_genome, struct frequency_info **frequency_table) +{ + open_freq_table(); + write_freq_table_header(ref_genome); + for (uint32_t seq_number = 0; seq_number < ref_genome->nb_seq; seq_number++) { + for (uint64_t seq_position = 0; seq_position < ref_genome->len_seq[seq_number]; seq_position++) { + dump_freq_table_entry(ref_genome->pt_seq[seq_number] + seq_position, frequency_table); + } + } + close_freq_table(); +} + +#endif static variant_t **variant_list[MAX_SEQ_GEN] = { NULL }; static pthread_mutex_t mutex; @@ -415,6 +488,10 @@ void create_vcf() fclose(sub_file); #endif + printf("dumping frequency table...\n"); + dump_freq_table(ref_genome, frequency_table); + printf("table done
dumping; starting variant calling...\n"); + unsigned int uncovered_nucleotides = 0; unsigned int badly_covered_nucleotides = 0; unsigned int well_covered_nucleotides = 0; @@ -429,28 +506,28 @@ void create_vcf() for (uint64_t seq_position = 0; seq_position < ref_genome->len_seq[seq_number]; seq_position++) { variant_t ** results = get_most_frequent_variant(ref_genome, frequency_table, seq_number, seq_position); - unsigned int total_score = 0; - total_score += frequency_table[0][ref_genome->pt_seq[seq_number] + seq_position].score; - total_score += frequency_table[1][ref_genome->pt_seq[seq_number] + seq_position].score; - total_score += frequency_table[2][ref_genome->pt_seq[seq_number] + seq_position].score; - total_score += frequency_table[3][ref_genome->pt_seq[seq_number] + seq_position].score; - total_coverage += total_score; - //total_score += frequency_table[4][ref_genome->pt_seq[seq_number] + seq_position].score; - if (total_score == 0) { - uncovered_nucleotides++; - } else if (total_score < 10) { - badly_covered_nucleotides++; - } else { - well_covered_nucleotides++; + unsigned int total_score = 0; + total_score += frequency_table[0][ref_genome->pt_seq[seq_number] + seq_position].score; + total_score += frequency_table[1][ref_genome->pt_seq[seq_number] + seq_position].score; + total_score += frequency_table[2][ref_genome->pt_seq[seq_number] + seq_position].score; + total_score += frequency_table[3][ref_genome->pt_seq[seq_number] + seq_position].score; + total_coverage += total_score; + //total_score += frequency_table[4][ref_genome->pt_seq[seq_number] + seq_position].score; + if (total_score == 0) { + uncovered_nucleotides++; + } else if (total_score < 10) { + badly_covered_nucleotides++; + } else { + well_covered_nucleotides++; if (total_score > 50) { - overly_covered_nucleotides++; + overly_covered_nucleotides++; } if (total_score > max_coverage) { - max_coverage = total_score; - chromosome_most_coverage = seq_number; - position_most_coverage = seq_position; 
+ max_coverage = total_score; + chromosome_most_coverage = seq_number; + position_most_coverage = seq_position; } - } + } int nb_var = 0; for(int i = 0; i < 5; ++i) { variant_t * var = results[i]; diff --git a/tests/read_freq_dump.py b/tests/read_freq_dump.py new file mode 100755 index 0000000..f3038a2 --- /dev/null +++ b/tests/read_freq_dump.py @@ -0,0 +1,58 @@ +#! /usr/bin/python3 +import argparse +import itertools +import mmap +import sys +import struct + +class freq_dump: + def __init__(self, file_name): + self.file_object = open(file_name, mode="rb") + self.mmap_freq_dump = mmap.mmap(self.file_object.fileno(), length=0, access=mmap.ACCESS_READ) + self.n_seq = self.mmap_freq_dump.read_byte() + self.seq_starts = [int.from_bytes(self.mmap_freq_dump.read(8), byteorder='little') for _ in range(self.n_seq)] + print(self.n_seq, self.seq_starts) + print([v%30 for v in self.seq_starts]) + + def read_address(self, sequence_id, index): + self.mmap_freq_dump.seek(self.seq_starts[sequence_id] + index*30) + freqs = [struct.unpack('f', self.mmap_freq_dump.read(4))[0] for _ in range(5)] + scores = [int.from_bytes(self.mmap_freq_dump.read(2), byteorder='little') for _ in range(5)] + return freqs, scores + + def __del__(self): + self.close() + + def close(self): + self.mmap_freq_dump.close() + self.file_object.close() + +def dump_around(freq_dump_obj, seq_n, index, context): + for i in range(-context, context+1, 1): + freqs, scores = freq_dump_obj.read_address(seq_n-1, index-i) + #print(seq_n, index+i, " ".join(f'{v:.2f}' for v in freqs), " \t", " ".join(str(v) for v in scores)) + print(seq_n, index+i, "\t".join(f'{v:.2f}' for v in freqs), sep="\t") + +def main(args): + context = args.context + dump_file_name = args.dump_file + freq_dump_obj = freq_dump(dump_file_name) + for address_file_name in args.address_files: + with open(address_file_name, "r") as address_file: + for line in address_file.readlines(): + split_line = line.split(" ") + seq_n = int(split_line[0]) + index = 
int(split_line[1]) + dump_around(freq_dump_obj, seq_n, index, context) + freq_dump_obj.close() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="show frequency table dump in human readable format around specific indices in the genome") + parser.add_argument("dump_file", type=str, help="Frequency table bin file to read from") + parser.add_argument("address_files", nargs="+", type=str, help="files containing addresses around which to show frequency table") + parser.add_argument("-C", "--context", default=2, type=int, dest="context", help="how many addresses to show before and after the focused addresses") + try: + main(parser.parse_args()) + except BrokenPipeError: + pass From 2bbb39f0c129cb7da664f44c1a12aea7393cf4c8 Mon Sep 17 00:00:00 2001 From: amoisson Date: Fri, 18 Mar 2022 09:37:39 +0100 Subject: [PATCH 32/48] fixed frequency table dumping --- host/inc/genome.h | 1 + host/src/processread.c | 38 +++++++++++++++++++++++--------------- host/src/vartree.c | 2 ++ tests/read_freq_dump.py | 41 ++++++++++++++++++++++++++++++++--------- 4 files changed, 58 insertions(+), 24 deletions(-) diff --git a/host/inc/genome.h b/host/inc/genome.h index 6bdf35d..7abf27d 100644 --- a/host/inc/genome.h +++ b/host/inc/genome.h @@ -34,6 +34,7 @@ struct frequency_info { float freq; unsigned int score; + unsigned int unsure_score; }; struct frequency_info** get_frequency_table(); void free_frequency_table(); diff --git a/host/src/processread.c b/host/src/processread.c index 5e2066d..1c8cb8a 100644 --- a/host/src/processread.c +++ b/host/src/processread.c @@ -21,7 +21,7 @@ #include "parse_args.h" #include "profiling.h" -#define DEBUG_READ_MAPPING false +#define DEBUG_READ_MAPPING true #define SIZE_INSERT_MEAN (400) #define SIZE_INSERT_STD (3 * 50) @@ -34,7 +34,7 @@ #define PQD_INIT_VAL (999) #if SIZE_READ>120 -#define MAX_SUBSTITUTION 31 +#define MAX_SUBSTITUTION 20 #else #define MAX_SUBSTITUTION 20 #endif @@ -371,7 +371,8 @@ bool update_frequency_table( 
int8_t *reads_buffer, float *reads_quality_buffer, int pos, - float mapq + float mapq, + bool unsure ) { STAT_RECORD_START(STAT_UPDATE_FREQUENCY_TABLE); @@ -452,6 +453,9 @@ bool update_frequency_table( } frequency_table[read_letter][current_position].freq += mapq * read_quality[invert_read ? SIZE_READ-backtrack_end->jx-1 : backtrack_end->jx]; frequency_table[read_letter][current_position].score++; + if (unsure) { + frequency_table[read_letter][current_position].unsure_score++; + } break; case CODE_INS: case CODE_DEL: @@ -677,7 +681,7 @@ static void do_process_read(process_read_arg_t *arg) if (np > 0) { LOG_DEBUG("found at least a pair (%u)\n", np); - if(np == 2) { + if(np == 2) {//if(false) { // found at least 2 matching pairs of positions. Check the delta between the two pairs to // decide whether we should keep the best pair @@ -696,12 +700,16 @@ static void do_process_read(process_read_arg_t *arg) #else if(delta >= DIST_PAIR_THRESHOLD) { #endif - hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P1[0], mapq); - hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P2[0], mapq); + hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P1[0], mapq, false); + hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P2[0], mapq, false); update = true; - }/* else { - LOG_WARN("unusable pair (%u)\n", result_tab[i].num/4); - }*/ + } else { + hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P1[0], mapq, true); + hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P2[0], mapq, true); + hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P1[1], mapq, true); + hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P2[1], mapq, true); + // 
LOG_WARN("unusable pair (%u)\n", result_tab[i].num/4); + } } else if(np) { // only one result, take it int delta = abs((int)(MISMATCH_COUNT(result_tab[P1[0]]) + MISMATCH_COUNT(result_tab[P2[0]])) - (2 * (MAX_SUBSTITUTION + 1))); @@ -716,8 +724,8 @@ static void do_process_read(process_read_arg_t *arg) #else if(delta >= DIST_PAIR_THRESHOLD) { #endif - hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P1[0], mapq); - hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P2[0], mapq); + hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P1[0], mapq, false); + hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P2[0], mapq, false); update = true; }/* else { LOG_WARN("unusable pair (%u)\n", result_tab[i].num/4); @@ -761,7 +769,7 @@ static void do_process_read(process_read_arg_t *arg) #else if(delta >= DIST_SINGLE_THRESHOLD) { #endif - hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P1[0], mapq); + hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P1[0], mapq, false); update = true; } } @@ -780,7 +788,7 @@ static void do_process_read(process_read_arg_t *arg) #else if(delta >= DIST_SINGLE_THRESHOLD) { #endif - hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P1[0], mapq); + hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P1[0], mapq, false); update = true; } } @@ -803,7 +811,7 @@ static void do_process_read(process_read_arg_t *arg) if(delta >= DIST_SINGLE_THRESHOLD) { #endif - hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P2[0], mapq); + hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P2[0], mapq, false); update = true; } } @@ 
-822,7 +830,7 @@ static void do_process_read(process_read_arg_t *arg) #else if (delta >= DIST_SINGLE_THRESHOLD) { #endif - hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P2[0], mapq); + hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P2[0], mapq, false); update = true; } } diff --git a/host/src/vartree.c b/host/src/vartree.c index 2f679e6..a33d46f 100644 --- a/host/src/vartree.c +++ b/host/src/vartree.c @@ -488,9 +488,11 @@ void create_vcf() fclose(sub_file); #endif +#if DUMP_FREQUENCY_TABLE printf("dumping frequency table...\n"); dump_freq_table(ref_genome, frequency_table); printf("table done dumping; starting variant calling...\n"); +#endif unsigned int uncovered_nucleotides = 0; unsigned int badly_covered_nucleotides = 0; diff --git a/tests/read_freq_dump.py b/tests/read_freq_dump.py index f3038a2..eeddd9f 100755 --- a/tests/read_freq_dump.py +++ b/tests/read_freq_dump.py @@ -11,8 +11,6 @@ def __init__(self, file_name): self.mmap_freq_dump = mmap.mmap(self.file_object.fileno(), length=0, access=mmap.ACCESS_READ) self.n_seq = self.mmap_freq_dump.read_byte() self.seq_starts = [int.from_bytes(self.mmap_freq_dump.read(8), byteorder='little') for _ in range(self.n_seq)] - print(self.n_seq, self.seq_starts) - print([v%30 for v in self.seq_starts]) def read_address(self, sequence_id, index): self.mmap_freq_dump.seek(self.seq_starts[sequence_id] + index*30) @@ -20,6 +18,10 @@ def read_address(self, sequence_id, index): scores = [int.from_bytes(self.mmap_freq_dump.read(2), byteorder='little') for _ in range(5)] return freqs, scores + def get_string_from_address(self, seq_n, index): + freqs, scores = self.read_address(seq_n-1, index-1) + return " ".join(str(v) for v in [f'{seq_n: >2}', f'{index: >8}', "", " ".join(f'{v:7.2f}' for v in freqs), "\t", " ".join(f'{v: >4}' for v in scores)]) + def __del__(self): self.close() @@ -27,11 +29,6 @@ def close(self): 
self.mmap_freq_dump.close() self.file_object.close() -def dump_around(freq_dump_obj, seq_n, index, context): - for i in range(-context, context+1, 1): - freqs, scores = freq_dump_obj.read_address(seq_n-1, index-i) - #print(seq_n, index+i, " ".join(f'{v:.2f}' for v in freqs), " \t", " ".join(str(v) for v in scores)) - print(seq_n, index+i, "\t".join(f'{v:.2f}' for v in freqs), sep="\t") def main(args): context = args.context @@ -39,11 +36,37 @@ def main(args): freq_dump_obj = freq_dump(dump_file_name) for address_file_name in args.address_files: with open(address_file_name, "r") as address_file: - for line in address_file.readlines(): + focuses = [] + last_dump = (0,0) + for _ in range(context): + line = address_file.readline() + if line=="": + break split_line = line.split(" ") seq_n = int(split_line[0]) index = int(split_line[1]) - dump_around(freq_dump_obj, seq_n, index, context) + focuses.append((seq_n, index)) + while focuses!=[]: + line = address_file.readline() + if line!="": + split_line = line.split(" ") + seq_n = int(split_line[0]) + index = int(split_line[1]) + focuses.append((seq_n, index)) + seq_n, index = focuses[0] + start = index-context + end = index+context + if last_dump[0] == seq_n and last_dump[1]+1 >= start: + start = max(start, last_dump[1]+1) + else: + print() + for i in range(start, end+1): + if (seq_n, i) in focuses: + print("* "+freq_dump_obj.get_string_from_address(seq_n, i)) + else: + print(" "+freq_dump_obj.get_string_from_address(seq_n, i)) + last_dump = (seq_n, end) + focuses.pop(0) freq_dump_obj.close() From 7a09e4fd48ff7c34b3f97ae52829d9680b593de2 Mon Sep 17 00:00:00 2001 From: amoisson Date: Tue, 5 Apr 2022 15:04:18 +0200 Subject: [PATCH 33/48] added a variants codependence info for more context aware vc --- host/inc/debug.h | 24 ++++++--- host/inc/genome.h | 18 +++++++ host/inc/profiling.h | 28 ++++++++++- host/src/accumulateread.c | 4 ++ host/src/genome.c | 103 +++++++++++++++++++++++++++++++++++++- host/src/processread.c | 66 
+++++++++++++++++------- host/src/upvc.c | 17 +------ host/src/vartree.c | 46 +++++++++++++++-- 8 files changed, 259 insertions(+), 47 deletions(-) diff --git a/host/inc/debug.h b/host/inc/debug.h index 4ab0fe1..6bf1dde 100644 --- a/host/inc/debug.h +++ b/host/inc/debug.h @@ -1,3 +1,7 @@ +#ifndef __DEBUG_H__ +#define __DEBUG_H__ +#include + #define V_QUIET 0 #define V_FATAL 1 #define V_ERROR 2 @@ -9,6 +13,7 @@ #define VERBOSE V_INFO #define VERBOSE_COLORS true #define VERBOSE_LOG_LEVEL true +#define VERBOSE_TIMESTAMP true #if VERBOSE_COLORS #define VERBOSE_COLOR_START_FATAL "\033[41m" @@ -34,39 +39,46 @@ #define VERBOSE_PRINT_PREFIX(level) #endif +#if VERBOSE_TIMESTAMP +#define VERBOSE_PRINT_TIMESTAMP() float t= (float) clock()/CLOCKS_PER_SEC; fprintf(stderr, "(%02d:%02d:%02.3f)", (int) (t/3600), (int) (t/60)%60, (float) ((int) (t*1000)%60000)/1000.); +#else +#define VERBOSE_PRINT_TIMESTAMP() +#endif #if VERBOSE>=V_TRACE -#define LOG_TRACE(...) fprintf(stderr, VERBOSE_PRINT_PREFIX(TRACE) __VA_ARGS__) +#define LOG_TRACE(string, ...) {VERBOSE_PRINT_TIMESTAMP() fprintf(stderr, VERBOSE_PRINT_PREFIX(TRACE) string, ## __VA_ARGS__);} #else #define LOG_TRACE(...) #endif #if VERBOSE>=V_DEBUG -#define LOG_DEBUG(...) fprintf(stderr, VERBOSE_PRINT_PREFIX(DEBUG) __VA_ARGS__) +#define LOG_DEBUG(string, ...) {VERBOSE_PRINT_TIMESTAMP() fprintf(stderr, VERBOSE_PRINT_PREFIX(DEBUG) string, ## __VA_ARGS__);} #else #define LOG_DEBUG(...) #endif #if VERBOSE>=V_INFO -#define LOG_INFO(...) fprintf(stderr, VERBOSE_PRINT_PREFIX(INFO) __VA_ARGS__) +#define LOG_INFO(string, ...) {VERBOSE_PRINT_TIMESTAMP() fprintf(stderr, VERBOSE_PRINT_PREFIX(INFO) string, ## __VA_ARGS__);} #else #define LOG_INFO(...) #endif #if VERBOSE>=V_WARN -#define LOG_WARN(...) fprintf(stderr, VERBOSE_PRINT_PREFIX(WARN) __VA_ARGS__) +#define LOG_WARN(string, ...) {VERBOSE_PRINT_TIMESTAMP() fprintf(stderr, VERBOSE_PRINT_PREFIX(WARN) string, ## __VA_ARGS__);} #else #define LOG_WARN(...) 
#endif #if VERBOSE>=V_ERROR -#define LOG_ERROR(...) fprintf(stderr, VERBOSE_PRINT_PREFIX(ERROR) __VA_ARGS__) +#define LOG_ERROR(string, ...) {VERBOSE_PRINT_TIMESTAMP() fprintf(stderr, VERBOSE_PRINT_PREFIX(ERROR) string, ## __VA_ARGS__);} #else #define LOG_ERROR(...) #endif #if VERBOSE>=V_FATAL -#define LOG_FATAL(...) fprintf(stderr, VERBOSE_PRINT_PREFIX(FATAL) __VA_ARGS__) +#define LOG_FATAL(string, ...) {VERBOSE_PRINT_TIMESTAMP() fprintf(stderr, VERBOSE_PRINT_PREFIX(FATAL) string, ## __VA_ARGS__);} #else #define LOG_FATAL(...) #endif + +#endif // __DEBUG_H__ diff --git a/host/inc/genome.h b/host/inc/genome.h index 7abf27d..4709b7d 100644 --- a/host/inc/genome.h +++ b/host/inc/genome.h @@ -6,6 +6,7 @@ #define __GENOME_H__ #include +#include #define MAX_SEQ_GEN (24) // max number of chromosomes #define MAX_SEQ_NAME_SIZE (8) @@ -36,7 +37,24 @@ struct frequency_info { unsigned int score; unsigned int unsure_score; }; + +#pragma pack(push,1) +struct variants_codependence_info { + uint16_t key; + uint8_t codependence_count; +}; + +#define COD_LIST_SIZE 4 +struct variants_codependence_info_list { + struct variants_codependence_info_list* next_list; + struct variants_codependence_info content[COD_LIST_SIZE]; +}; +#pragma pack(pop) + +void add_codependence_info(struct variants_codependence_info_list** next_variants_info_list, int16_t other_index_delta, uint8_t current_letter, uint8_t other_letter, unsigned int genome_size, pthread_mutex_t* mutex); + struct frequency_info** get_frequency_table(); +struct variants_codependence_info_list** get_codependence_table(pthread_mutex_t* mutex); void free_frequency_table(); #endif /* __GENOME_H__ */ diff --git a/host/inc/profiling.h b/host/inc/profiling.h index c5098ca..e30b305 100644 --- a/host/inc/profiling.h +++ b/host/inc/profiling.h @@ -11,7 +11,7 @@ struct time_stat_t { clock_t substep_total_times[STAT_MAX_SUBSTEPS]; }; -struct time_stat_t profiling[15]; +struct time_stat_t profiling[25]; #define STAT_SUB_ONLY_PATH 0 #define 
STAT_DPD 1 @@ -28,6 +28,9 @@ struct time_stat_t profiling[15]; #define STAT_THREAD_ACC 12 #define STAT_THREAD_PROCESS 13 #define STAT_DO_MAPPING 14 +#define STAT_ADD_CODEPENDENCE_INFO 15 +#define STAT_GET_NEW_CODEPENDENCE_INFO 16 +#define STAT_ALLOCATE_NEW_CHUNK 17 #define STAT_RECORD_START(FUNCTION) \ clock_t profiling_step_time, profiling_last_step_time; \ @@ -61,11 +64,32 @@ struct time_stat_t profiling[15]; for (int i=0; i 0) { \ - printf("\t\t%d:\t", i); \ + printf("\t\t%d:\t", i); \ PRINT_MICROSECONDS(profiling[FUNCTION].substep_total_times[i]) \ printf("\n"); \ } \ } + +#define PRINT_ALL_FUNCTION_STAT() \ + printf("\nprofiling:\n\n"); \ + PRINT_FUNCTION_STAT(STAT_DO_MAPPING); \ + PRINT_FUNCTION_STAT(STAT_THREAD_PROCESS); \ + PRINT_FUNCTION_STAT(STAT_THREAD_ACC); \ + PRINT_FUNCTION_STAT(STAT_THREAD_DISPATCH); \ + PRINT_FUNCTION_STAT(STAT_THREAD_GET_READS); \ + PRINT_FUNCTION_STAT(STAT_EXEC_DPUS); \ + PRINT_FUNCTION_STAT(STAT_EXEC_ROUND); \ + PRINT_FUNCTION_STAT(STAT_PROCESS_READ); \ + PRINT_FUNCTION_STAT(STAT_DO_PROCESS_READ); \ + PRINT_FUNCTION_STAT(STAT_ADD_TO_NON_MAPPED_READ); \ + PRINT_FUNCTION_STAT(STAT_UPDATE_FREQUENCY_TABLE); \ + PRINT_FUNCTION_STAT(STAT_GET_READ_UPDATE_POSITIONS); \ + PRINT_FUNCTION_STAT(STAT_CODE_ALIGNMENT); \ + PRINT_FUNCTION_STAT(STAT_SUB_ONLY_PATH); \ + PRINT_FUNCTION_STAT(STAT_DPD); \ + PRINT_FUNCTION_STAT(STAT_ADD_CODEPENDENCE_INFO); \ + PRINT_FUNCTION_STAT(STAT_GET_NEW_CODEPENDENCE_INFO); \ + PRINT_FUNCTION_STAT(STAT_ALLOCATE_NEW_CHUNK); #endif /* __PROFILING_H__ */ diff --git a/host/src/accumulateread.c b/host/src/accumulateread.c index 615e1c6..6748aa5 100644 --- a/host/src/accumulateread.c +++ b/host/src/accumulateread.c @@ -6,6 +6,8 @@ #include "common.h" #include "index.h" #include "upvc.h" +#include "profiling.h" +#include "debug.h" #include #include @@ -190,6 +192,7 @@ acc_results_t accumulate_get_result(unsigned int pass_id, bool free_results) void accumulate_read(unsigned int pass_id, unsigned int dpu_offset) { 
printf("DPU_OFFSET: %u - PASS_ID: %u\n", dpu_offset, pass_id); + PRINT_ALL_FUNCTION_STAT(); nb_dpus_used_current_run = MIN(index_get_nb_dpu() - dpu_offset, nb_dpus_per_run); acc_res = RESULTS_BUFFERS(pass_id); @@ -241,6 +244,7 @@ void accumulate_read(unsigned int pass_id, unsigned int dpu_offset) size_t size = sizeof(dpu_result_out_t) * (nb_read + 1); // Alloc the merged and sorted tab of result for this pass + LOG_INFO("allocating %lu for dpu results\n", size); dpu_result_out_t *merged_result_tab = malloc(size); assert(merged_result_tab != NULL); diff --git a/host/src/genome.c b/host/src/genome.c index 238f277..3ac8f99 100644 --- a/host/src/genome.c +++ b/host/src/genome.c @@ -7,10 +7,13 @@ #include #include #include +#include #include "genome.h" #include "parse_args.h" #include "upvc.h" +#include "debug.h" +#include "profiling.h" #define MAX_BUF_SIZE (1024) #define GENOME_BINARY "genome.bin" @@ -121,26 +124,124 @@ void genome_free() * Each entry is a table of the size of the reference genome **/ static struct frequency_info* frequency_table[5]; +static struct variants_codependence_info_list** variant_codependences; +#define NB_CODEPENDENCE_CHUNK_DIV (1<<10) +struct codependence_chunk* last_allocated_codependence_chunk[NB_CODEPENDENCE_CHUNK_DIV]; +unsigned int allocated_chunks = 0;//for debug purposes (TODO: remove) static bool init_frequency_table = false; +static bool init_codependence_table = false; + +#define ALLOC_CHUNK_SIZE 100 +struct codependence_chunk { + struct codependence_chunk* previous_chunk; + struct codependence_chunk* next_chunk; + unsigned int next_slot_free; + struct variants_codependence_info_list codependence_info_lists[ALLOC_CHUNK_SIZE]; +}; + +static void allocate_new_codependence_chunk(int i) { + STAT_RECORD_START(STAT_ALLOCATE_NEW_CHUNK); + struct codependence_chunk* new_chunk = calloc(1, sizeof(struct codependence_chunk)); + new_chunk->previous_chunk = last_allocated_codependence_chunk[i]; + new_chunk->next_slot_free = 0; + 
last_allocated_codependence_chunk[i] = new_chunk; + LOG_INFO("size of pointer: %d\n", (int) sizeof(void*)); + LOG_INFO("size of co-info: %d\n", (int) sizeof(struct variants_codependence_info)); + LOG_INFO("size of co-info-list: %d\n", (int) sizeof(struct variants_codependence_info_list)); + LOG_INFO("size of chunk: %d\n", (int) sizeof(struct codependence_chunk)); + LOG_INFO("allocating new codependence chunk (%d)\n", ++allocated_chunks); + STAT_RECORD_LAST_STEP(STAT_ALLOCATE_NEW_CHUNK, 0); +} + +static struct variants_codependence_info_list* get_new_codependence_info_list(int i) { + STAT_RECORD_START(STAT_GET_NEW_CODEPENDENCE_INFO); + if (last_allocated_codependence_chunk[i]->next_slot_free >= ALLOC_CHUNK_SIZE) { + allocate_new_codependence_chunk(i); + } + STAT_RECORD_LAST_STEP(STAT_GET_NEW_CODEPENDENCE_INFO, 0); + return &(last_allocated_codependence_chunk[i]->codependence_info_lists[(last_allocated_codependence_chunk[i]->next_slot_free)++]); +} + +void add_codependence_info(struct variants_codependence_info_list** next_variants_info_list, int16_t other_index_delta, uint8_t current_letter, uint8_t other_letter, unsigned int genome_size, pthread_mutex_t* mutex) +{ + STAT_RECORD_START(STAT_ADD_CODEPENDENCE_INFO); + //struct variants_codependence_info_list** next_variants_info_list = &(freq_info->variants_codependence_list); + struct variants_codependence_info_list* last_variants_info_list = NULL; + uint16_t key = (uint16_t) ((other_index_delta<<4) | ((other_letter&0x3)<<2) | (current_letter&0x3)); + for (;*next_variants_info_list != NULL; next_variants_info_list = &((*next_variants_info_list)->next_list)) { + for (int i=0; i<COD_LIST_SIZE; i++) { + if ((*next_variants_info_list)->content[i].key == key) { + if ((*next_variants_info_list)->content[i].codependence_count < UINT8_MAX) + (*next_variants_info_list)->content[i].codependence_count++; + STAT_RECORD_LAST_STEP(STAT_ADD_CODEPENDENCE_INFO, 0); + return; + } + } + last_variants_info_list = *next_variants_info_list; + } + if (last_variants_info_list != NULL) { + for (int
i=0; icontent[i].key == 0) { + last_variants_info_list->content[i].key = key; + last_variants_info_list->content[i].codependence_count = 1; + STAT_RECORD_LAST_STEP(STAT_ADD_CODEPENDENCE_INFO, 1); + return; + } + } + } + LOG_DEBUG("getting new codependence_info for : ? -> %010d (char:%d)\n", other_index_delta, other_letter); + int chunk_div_index = (other_index_delta*NB_CODEPENDENCE_CHUNK_DIV)/genome_size; + STAT_RECORD_STEP(STAT_ADD_CODEPENDENCE_INFO, 2); + pthread_mutex_lock(mutex); + STAT_RECORD_STEP(STAT_ADD_CODEPENDENCE_INFO, 3); + struct variants_codependence_info_list* new_info_list = get_new_codependence_info_list(chunk_div_index); + pthread_mutex_unlock(mutex); + new_info_list->next_list = NULL; + new_info_list->content[0].key = key; + new_info_list->content[0].codependence_count = 1; + *next_variants_info_list = new_info_list; + STAT_RECORD_LAST_STEP(STAT_ADD_CODEPENDENCE_INFO, 4); +} struct frequency_info** get_frequency_table() { if(!init_frequency_table) { + init_frequency_table = true; // allocate frequency_table on first call for(int i = 0; i < 5; ++i) { frequency_table[i] = (struct frequency_info*)calloc(genome.fasta_file_size, sizeof(struct frequency_info)); } - init_frequency_table = true; } return frequency_table; } +struct variants_codependence_info_list** get_codependence_table(pthread_mutex_t* mutex) { + if(!init_codependence_table) { + pthread_mutex_lock(mutex); + init_codependence_table = true; + variant_codependences = (struct variants_codependence_info_list**)calloc(genome.fasta_file_size, sizeof(void*)); + for (int i=0; iprevious_chunk; + for (;last_allocated_codependence_chunk[i] != NULL; to_delete_next = last_allocated_codependence_chunk[i]->previous_chunk) { + free(last_allocated_codependence_chunk[i]); + last_allocated_codependence_chunk[i] = to_delete_next; + } + } init_frequency_table = false; } } diff --git a/host/src/processread.c b/host/src/processread.c index 1c8cb8a..2d59d32 100644 --- a/host/src/processread.c +++ 
b/host/src/processread.c @@ -40,7 +40,7 @@ #endif #define MAX_SCORE_DIFFERENCE_WITH_BEST 40 -#define MAX_CONSIDERED_MAPPINGS 1000 +#define MAX_CONSIDERED_MAPPINGS 4000 /* static void log_nucleotides(int8_t *s, int max_len) { @@ -364,6 +364,7 @@ int DPD(int8_t *s1, int8_t *s2, backtrack_t *backtrack, backtrack_t ** backtrack #define NB_FREQ_TABLE_MUTEXES (1<<10) static pthread_mutex_t freq_table_mutexes[NB_FREQ_TABLE_MUTEXES+1]; //static pthread_mutex_t print_mutex; +static pthread_mutex_t codependence_list_mutex; bool update_frequency_table( genome_t *ref_genome, @@ -377,6 +378,7 @@ bool update_frequency_table( { STAT_RECORD_START(STAT_UPDATE_FREQUENCY_TABLE); struct frequency_info ** frequency_table = get_frequency_table(); + struct variants_codependence_info_list** codependence_table = get_codependence_table(&codependence_list_mutex); uint64_t genome_pos = ref_genome->pt_seq[result_tab[pos].coord.seq_nr] + result_tab[pos].coord.seed_nr; int num = result_tab[pos].num; int8_t *read = reads_buffer + (num * SIZE_READ); @@ -438,6 +440,9 @@ bool update_frequency_table( bool has_indel = false; uint8_t read_letter; //printf("genome_pos=%lu\n", genome_pos); + unsigned int variants_found = 0; + unsigned int variants_found_positions[SIZE_READ]; + uint8_t variants_found_letters[SIZE_READ]; for (; backtrack_end > backtrack; backtrack_end--) { unsigned int current_position = genome_pos+backtrack_end->ix; //printf("backtrack_end->ix=%u; current_position=%u\n", backtrack_end->ix, current_position); @@ -445,25 +450,51 @@ bool update_frequency_table( continue; } switch (backtrack_end->type) { - case 0: case CODE_SUB: - read_letter = read[backtrack_end->jx]; - if (read_letter > 3) { - read_letter = read_letter>>1 & 0x3; - } - frequency_table[read_letter][current_position].freq += mapq * read_quality[invert_read ? 
SIZE_READ-backtrack_end->jx-1 : backtrack_end->jx]; - frequency_table[read_letter][current_position].score++; - if (unsure) { - frequency_table[read_letter][current_position].unsure_score++; - } - break; + read_letter = read[backtrack_end->jx]; + if (read_letter > 3) { + read_letter = read_letter>>1 & 0x3; + } + + // update codependence info + for (unsigned int i=0; ifasta_file_size, &codependence_list_mutex); + add_codependence_info(&codependence_table[variants_found_positions[i]], (int16_t) (current_position-variants_found_positions[i]), variants_found_letters[i], read_letter, ref_genome->fasta_file_size, &codependence_list_mutex); + //pthread_mutex_unlock(&codependence_list_mutex); + } + variants_found_positions[variants_found] = current_position; + variants_found_letters[variants_found] = read_letter; + variants_found++; + + frequency_table[read_letter][current_position].freq += mapq * read_quality[invert_read ? SIZE_READ-backtrack_end->jx-1 : backtrack_end->jx]; + frequency_table[read_letter][current_position].score++; + + + if (unsure) { + frequency_table[read_letter][current_position].unsure_score++; + } + break; + case 0: + read_letter = read[backtrack_end->jx]; + if (read_letter > 3) { + read_letter = read_letter>>1 & 0x3; + } + frequency_table[read_letter][current_position].freq += mapq * read_quality[invert_read ? 
SIZE_READ-backtrack_end->jx-1 : backtrack_end->jx]; + frequency_table[read_letter][current_position].score++; + + + if (unsure) { + frequency_table[read_letter][current_position].unsure_score++; + } + break; case CODE_INS: case CODE_DEL: - has_indel = true; - //LOG_WARN("unhandled indel\n"); - // TODO: handle indels - break; - // TODO: handle errors even if they should never happen + has_indel = true; + //LOG_WARN("unhandled indel\n"); + // TODO: handle indels + break; + // TODO: handle errors even if they should never happen } } STAT_RECORD_STEP(STAT_UPDATE_FREQUENCY_TABLE, 4); @@ -936,6 +967,7 @@ void process_read_free() for (int i=0; idata[genome_pos]) continue; // not a variant if the same nucleotide as in reference genome // if frequency and depth pass the threshold, consider it a variant - if(score > reverse_filter(total)) { + if(freq > reverse_filter(total)) { // this is a substitution, create variant variant_t *var = (variant_t *)malloc(sizeof(variant_t)); @@ -406,6 +406,22 @@ static variant_t ** get_most_frequent_variant(genome_t * ref_genome, struct freq return results; } +static void add_codependence_to_freq_table(struct frequency_info** frequency_table, struct variants_codependence_info_list* codependence_list, uint64_t position, uint8_t letter, float freq_update) { + for (;codependence_list != NULL; codependence_list=codependence_list->next_list) { + for (int i = 0; icontent[i].key != 0; i++) { + uint8_t current_letter = codependence_list->content[i].key & 0x3; + if (current_letter == letter) { + uint8_t other_letter = (codependence_list->content[i].key >> 2) & 0x3; + uint64_t position_delta = codependence_list->content[i].key >> 4; + frequency_table[other_letter][position + position_delta].freq += (codependence_list->content[i].codependence_count) * freq_update; + } + } + } +} + +#define POSITIVE_COD_INFLUENCE 0.02 +#define NEGATIVE_COD_INFLUENCE -0.2 + //TODO here read frequency table and write vcf (take max of frequency table to find substitution if 
any) void create_vcf() { @@ -444,6 +460,7 @@ void create_vcf() /* ####### END OF HEADER ####### */ struct frequency_info **frequency_table = get_frequency_table(); + struct variants_codependence_info_list** codependence_table = get_codependence_table(NULL);// giving a NULL pointer should only work if the table is already allocated; which should be the case uint32_t nb_pos_multiple_var = 0; /** @@ -502,6 +519,25 @@ void create_vcf() unsigned int position_most_coverage = 0; unsigned int chromosome_most_coverage = 99999; unsigned int total_coverage = 0; + // First pass on the frequency table to take into account variant codependence + LOG_INFO("doing first pass of vc\n"); + /* for each sequence in the genome */ + for (uint32_t seq_number = 0; seq_number < ref_genome->nb_seq; seq_number++) { + /* for each position in the sequence */ + for (uint64_t seq_position = 0; seq_position < ref_genome->len_seq[seq_number]; seq_position++) { + variant_t ** results = get_most_frequent_variant(ref_genome, frequency_table, seq_number, seq_position); + for (uint8_t i=0; i<4; i++) { + uint64_t genome_pos = ref_genome->pt_seq[seq_number] + seq_position; + if (results[i]) { + add_codependence_to_freq_table(frequency_table, codependence_table[genome_pos], genome_pos, i, POSITIVE_COD_INFLUENCE); + } else if (frequency_table[i][genome_pos].score>0) { + add_codependence_to_freq_table(frequency_table, codependence_table[genome_pos], genome_pos, i, NEGATIVE_COD_INFLUENCE); + } + } + } + } + + LOG_INFO("doing second and final pass of vc\n"); /* for each sequence in the genome */ for (uint32_t seq_number = 0; seq_number < ref_genome->nb_seq; seq_number++) { /* for each position in the sequence */ From 64e38bb5cba367fdfccd7750dac0e3b4eda2e488 Mon Sep 17 00:00:00 2001 From: amoisson Date: Mon, 18 Apr 2022 15:04:36 +0200 Subject: [PATCH 34/48] fixed deallocation --- host/inc/genome.h | 3 ++- host/src/genome.c | 46 +++++++++++++++++++++------------ host/src/processread.c | 4 +-- host/src/vartree.c 
| 58 ++++++++++++++++++++++++------------------ 4 files changed, 67 insertions(+), 44 deletions(-) diff --git a/host/inc/genome.h b/host/inc/genome.h index 4709b7d..7fe6b04 100644 --- a/host/inc/genome.h +++ b/host/inc/genome.h @@ -40,7 +40,7 @@ struct frequency_info { #pragma pack(push,1) struct variants_codependence_info { - uint16_t key; + int16_t key; uint8_t codependence_count; }; @@ -56,5 +56,6 @@ void add_codependence_info(struct variants_codependence_info_list** next_variant struct frequency_info** get_frequency_table(); struct variants_codependence_info_list** get_codependence_table(pthread_mutex_t* mutex); void free_frequency_table(); +void free_codependence_chunks(); #endif /* __GENOME_H__ */ diff --git a/host/src/genome.c b/host/src/genome.c index 3ac8f99..d98c056 100644 --- a/host/src/genome.c +++ b/host/src/genome.c @@ -42,6 +42,7 @@ void genome_load() && "Wrong header, make sure you have generated your MRAMs with the same version of UPVC that you are using."); assert(genome.version == GENOME_VERSION && "Could not load a genome generated with a different version of UPVC."); + LOG_INFO("allocating genome data (%luMB + %luMB)\n", sizeof(int8_t) * genome.fasta_file_size/1000000, sizeof(int32_t) * genome.fasta_file_size/1000000); genome.data = (int8_t *)malloc(sizeof(int8_t) * genome.fasta_file_size); genome.mapping_coverage = (int32_t *)calloc(sizeof(int32_t), genome.fasta_file_size); assert(genome.data != NULL && genome.mapping_coverage != NULL); @@ -72,6 +73,7 @@ void genome_create() genome.fasta_file_size = ftell(genome_file); rewind(genome_file); + LOG_INFO("allocating genome data (%luMB)\n", sizeof(int8_t) * genome.fasta_file_size/1000000); genome.data = (int8_t *)malloc(sizeof(int8_t) * genome.fasta_file_size); assert(genome.data != NULL); genome.nb_seq = 0; @@ -141,15 +143,16 @@ struct codependence_chunk { static void allocate_new_codependence_chunk(int i) { STAT_RECORD_START(STAT_ALLOCATE_NEW_CHUNK); + // LOG_INFO("size of pointer: %d\n", (int) 
sizeof(void*)); + // LOG_INFO("size of co-info: %d\n", (int) sizeof(struct variants_codependence_info)); + // LOG_INFO("size of co-info-list: %d\n", (int) sizeof(struct variants_codependence_info_list)); + LOG_INFO("size of chunk: %d\n", (int) sizeof(struct codependence_chunk)); + LOG_INFO("allocating new codependence chunk (%d)\n", ++allocated_chunks); struct codependence_chunk* new_chunk = calloc(1, sizeof(struct codependence_chunk)); + assert(new_chunk != NULL); new_chunk->previous_chunk = last_allocated_codependence_chunk[i]; new_chunk->next_slot_free = 0; last_allocated_codependence_chunk[i] = new_chunk; - LOG_INFO("size of pointer: %d\n", (int) sizeof(void*)); - LOG_INFO("size of co-info: %d\n", (int) sizeof(struct variants_codependence_info)); - LOG_INFO("size of co-info-list: %d\n", (int) sizeof(struct variants_codependence_info_list)); - LOG_INFO("size of chunk: %d\n", (int) sizeof(struct codependence_chunk)); - LOG_INFO("allocating new codependence chunk (%d)\n", ++allocated_chunks); STAT_RECORD_LAST_STEP(STAT_ALLOCATE_NEW_CHUNK, 0); } @@ -167,7 +170,7 @@ void add_codependence_info(struct variants_codependence_info_list** next_variant STAT_RECORD_START(STAT_ADD_CODEPENDENCE_INFO); //struct variants_codependence_info_list** next_variants_info_list = &(freq_info->variants_codependence_list); struct variants_codependence_info_list* last_variants_info_list = NULL; - uint16_t key = (uint16_t) ((other_index_delta<<4) | ((other_letter&0x3)<<2) | (current_letter&0x3)); + int16_t key = (int16_t) ((other_index_delta<<4) | ((other_letter&0x3)<<2) | (current_letter&0x3)); for (;*next_variants_info_list != NULL; next_variants_info_list = &((*next_variants_info_list)->next_list)) { for (int i=0; icontent[i].key == key) { @@ -207,9 +210,11 @@ struct frequency_info** get_frequency_table() { if(!init_frequency_table) { init_frequency_table = true; + LOG_INFO("allocating frequency_table (5x%luMB)\n", sizeof(struct frequency_info)*genome.fasta_file_size/1000000); // 
allocate frequency_table on first call for(int i = 0; i < 5; ++i) { frequency_table[i] = (struct frequency_info*)calloc(genome.fasta_file_size, sizeof(struct frequency_info)); + assert(frequency_table[i] != NULL); } } return frequency_table; @@ -218,11 +223,15 @@ struct frequency_info** get_frequency_table() { struct variants_codependence_info_list** get_codependence_table(pthread_mutex_t* mutex) { if(!init_codependence_table) { pthread_mutex_lock(mutex); - init_codependence_table = true; - variant_codependences = (struct variants_codependence_info_list**)calloc(genome.fasta_file_size, sizeof(void*)); - for (int i=0; iprevious_chunk; - for (;last_allocated_codependence_chunk[i] != NULL; to_delete_next = last_allocated_codependence_chunk[i]->previous_chunk) { + struct codependence_chunk* to_delete_next = last_allocated_codependence_chunk[i]; + for (;last_allocated_codependence_chunk[i] != NULL; last_allocated_codependence_chunk[i] = to_delete_next) { + to_delete_next = last_allocated_codependence_chunk[i]->previous_chunk; free(last_allocated_codependence_chunk[i]); - last_allocated_codependence_chunk[i] = to_delete_next; } } - init_frequency_table = false; + init_codependence_table = false; } } - - diff --git a/host/src/processread.c b/host/src/processread.c index 2d59d32..c605307 100644 --- a/host/src/processread.c +++ b/host/src/processread.c @@ -21,7 +21,7 @@ #include "parse_args.h" #include "profiling.h" -#define DEBUG_READ_MAPPING true +#define DEBUG_READ_MAPPING false #define SIZE_INSERT_MEAN (400) #define SIZE_INSERT_STD (3 * 50) @@ -475,7 +475,7 @@ bool update_frequency_table( frequency_table[read_letter][current_position].unsure_score++; } break; - case 0: + case 0: // FIXME : CODE_MATCH read_letter = read[backtrack_end->jx]; if (read_letter > 3) { read_letter = read_letter>>1 & 0x3; diff --git a/host/src/vartree.c b/host/src/vartree.c index 73785f4..8528b76 100644 --- a/host/src/vartree.c +++ b/host/src/vartree.c @@ -1,4 +1,4 @@ - /** +/** * Copyright 
2016-2019 - Dominique Lavenier & UPMEM */ @@ -96,6 +96,7 @@ void variant_tree_init() genome_t *genome = genome_get(); pthread_mutex_init(&mutex, NULL); for (unsigned int each_seq = 0; each_seq < genome->nb_seq; each_seq++) { + LOG_INFO("allocating variant_list (%luMB)\n", sizeof(variant_t*) * genome->len_seq[each_seq]); variant_list[each_seq] = (variant_t **)calloc(genome->len_seq[each_seq], sizeof(variant_t *)); } } @@ -367,43 +368,46 @@ float reverse_filter(uint32_t score) { FILE * dbg_file = NULL; FILE * sub_file = NULL; -static variant_t ** get_most_frequent_variant(genome_t * ref_genome, struct frequency_info ** frequency_table, uint32_t seq_number, uint64_t seq_position) { +static void get_most_frequent_variant(genome_t * ref_genome, struct frequency_info ** frequency_table, uint32_t seq_number, uint64_t seq_position, variant_t * results) { - static char nucleotide[4] = { 'A', 'C', 'T', 'G' }; + static char nucleotide[4] = { 'A', 'C', 'T', 'G' };// FIXME : const uint64_t genome_pos = ref_genome->pt_seq[seq_number] + seq_position; - variant_t** results = calloc(5, sizeof(variant_t*)); float total = 0; for(int i = 0; i < 5; ++i) { total += frequency_table[i][genome_pos].freq; + results[i].depth = 0; + results[i].score = 0; } if(total == 0) - return results; + return ;//results; for(int i = 0; i < 5; ++i) { float freq = frequency_table[i][genome_pos].freq; //uint32_t score = frequency_table[i][genome_pos].score; - if(i == ref_genome->data[genome_pos]) + if(i == ref_genome->data[genome_pos]) { continue; // not a variant if the same nucleotide as in reference genome + } // if frequency and depth pass the threshold, consider it a variant if(freq > reverse_filter(total)) { // this is a substitution, create variant - variant_t *var = (variant_t *)malloc(sizeof(variant_t)); - var->score = frequency_table[i][genome_pos].score; - var->depth = frequency_table[i][genome_pos].score; - var->ref[0] = nucleotide[ref_genome->data[genome_pos]]; - var->ref[1] = '\0'; - 
var->alt[0] = nucleotide[i]; - var->alt[1] = '\0'; - results[i] = var; + results[i].score = frequency_table[i][genome_pos].score; + results[i].depth = frequency_table[i][genome_pos].score; + results[i].ref[0] = nucleotide[ref_genome->data[genome_pos]]; + results[i].ref[1] = '\0'; + results[i].alt[0] = nucleotide[i]; + results[i].alt[1] = '\0'; + } else { + results[i].score = 0; + results[i].depth = 0; } } //printf("get_most_frequent_variant: genome_pos %lu, nucleotide max freq %d %f %c\n", genome_pos, nucId, max, nucId >= 0 ? nucleotide[nucId] : '-'); - return results; + // return results; } static void add_codependence_to_freq_table(struct frequency_info** frequency_table, struct variants_codependence_info_list* codependence_list, uint64_t position, uint8_t letter, float freq_update) { @@ -412,15 +416,15 @@ static void add_codependence_to_freq_table(struct frequency_info** frequency_tab uint8_t current_letter = codependence_list->content[i].key & 0x3; if (current_letter == letter) { uint8_t other_letter = (codependence_list->content[i].key >> 2) & 0x3; - uint64_t position_delta = codependence_list->content[i].key >> 4; + int64_t position_delta = codependence_list->content[i].key >> 4; frequency_table[other_letter][position + position_delta].freq += (codependence_list->content[i].codependence_count) * freq_update; } } } } -#define POSITIVE_COD_INFLUENCE 0.02 -#define NEGATIVE_COD_INFLUENCE -0.2 +#define POSITIVE_COD_INFLUENCE 0.00609352 +#define NEGATIVE_COD_INFLUENCE -0.129805 //TODO here read frequency table and write vcf (take max of frequency table to find substitution if any) void create_vcf() @@ -519,16 +523,18 @@ void create_vcf() unsigned int position_most_coverage = 0; unsigned int chromosome_most_coverage = 99999; unsigned int total_coverage = 0; + variant_t variants_to_call[5]; // First pass on the frequency table to take into account variant codependence LOG_INFO("doing first pass of vc\n"); /* for each sequence in the genome */ for (uint32_t seq_number 
= 0; seq_number < ref_genome->nb_seq; seq_number++) { /* for each position in the sequence */ + LOG_INFO("sequence %u\n", seq_number); for (uint64_t seq_position = 0; seq_position < ref_genome->len_seq[seq_number]; seq_position++) { - variant_t ** results = get_most_frequent_variant(ref_genome, frequency_table, seq_number, seq_position); + get_most_frequent_variant(ref_genome, frequency_table, seq_number, seq_position, variants_to_call); for (uint8_t i=0; i<4; i++) { uint64_t genome_pos = ref_genome->pt_seq[seq_number] + seq_position; - if (results[i]) { + if (variants_to_call[i].depth) { add_codependence_to_freq_table(frequency_table, codependence_table[genome_pos], genome_pos, i, POSITIVE_COD_INFLUENCE); } else if (frequency_table[i][genome_pos].score>0) { add_codependence_to_freq_table(frequency_table, codependence_table[genome_pos], genome_pos, i, NEGATIVE_COD_INFLUENCE); @@ -537,13 +543,16 @@ void create_vcf() } } + free_codependence_chunks(); + LOG_INFO("doing second and final pass of vc\n"); /* for each sequence in the genome */ for (uint32_t seq_number = 0; seq_number < ref_genome->nb_seq; seq_number++) { /* for each position in the sequence */ + LOG_INFO("sequence %u\n", seq_number); for (uint64_t seq_position = 0; seq_position < ref_genome->len_seq[seq_number]; seq_position++) { - variant_t ** results = get_most_frequent_variant(ref_genome, frequency_table, seq_number, seq_position); + get_most_frequent_variant(ref_genome, frequency_table, seq_number, seq_position, variants_to_call); unsigned int total_score = 0; total_score += frequency_table[0][ref_genome->pt_seq[seq_number] + seq_position].score; total_score += frequency_table[1][ref_genome->pt_seq[seq_number] + seq_position].score; @@ -568,16 +577,15 @@ void create_vcf() } int nb_var = 0; for(int i = 0; i < 5; ++i) { - variant_t * var = results[i]; - if(var) { + variant_t * var = &variants_to_call[i]; + if(var->depth) { + // LOG_DEBUG("calling variant %d at %u:%lu, freq:%f\n", i, seq_number, 
seq_position, frequency_table[i][ref_genome->pt_seq[seq_number] + seq_position].freq); nb_variant += print_variant_tree(var, seq_number, seq_position, ref_genome, vcf_file) ? 1 : 0; - free(var); nb_var++; } if(nb_var > 1) nb_pos_multiple_var++; } - free(results); } } From 595c3f3c152345564c91d215aee2dce7c4001f4b Mon Sep 17 00:00:00 2001 From: amoisson Date: Tue, 26 Apr 2022 15:07:44 +0200 Subject: [PATCH 35/48] better constants --- dpu/src/task.c | 4 ++-- host/src/vartree.c | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/dpu/src/task.c b/dpu/src/task.c index dab0f99..dd53a12 100644 --- a/dpu/src/task.c +++ b/dpu/src/task.c @@ -56,9 +56,9 @@ __host dpu_compute_time_t DPU_COMPUTE_TIME_VAR; * @brief Maximum score allowed. */ #if SIZE_READ>120 -#define MAX_SCORE 128 +#define MAX_SCORE 80 #else -#define MAX_SCORE 86 +#define MAX_SCORE 80 #endif /** diff --git a/host/src/vartree.c b/host/src/vartree.c index 8528b76..ba38e0e 100644 --- a/host/src/vartree.c +++ b/host/src/vartree.c @@ -358,8 +358,8 @@ __attribute__((unused)) uint32_t depth_filter_fixed_3_f15(float freq) { return 3; } -#define AFFINE_B 1.12795388963028 -#define AFFINE_A 0.090970032203921 +#define AFFINE_B 1.37459 +#define AFFINE_A 0.0646469 float reverse_filter(uint32_t score) { return AFFINE_A*(float)score + AFFINE_B; @@ -423,8 +423,8 @@ static void add_codependence_to_freq_table(struct frequency_info** frequency_tab } } -#define POSITIVE_COD_INFLUENCE 0.00609352 -#define NEGATIVE_COD_INFLUENCE -0.129805 +#define POSITIVE_COD_INFLUENCE 0.00687069 +#define NEGATIVE_COD_INFLUENCE -0.236244 //TODO here read frequency table and write vcf (take max of frequency table to find substitution if any) void create_vcf() From 365bb7738ad1d62fbaf52d11e511c88a61507b67 Mon Sep 17 00:00:00 2001 From: amoisson Date: Thu, 5 May 2022 10:09:42 +0200 Subject: [PATCH 36/48] fixed mapping file for chromosomes beyond chr1; now using single mapping --- host/src/processread.c | 6 +++--- host/src/vartree.c 
| 21 +++++++++++++-------- tests/igvlike-focus.py | 4 +++- 3 files changed, 19 insertions(+), 12 deletions(-) diff --git a/host/src/processread.c b/host/src/processread.c index c605307..1f1f994 100644 --- a/host/src/processread.c +++ b/host/src/processread.c @@ -424,7 +424,7 @@ bool update_frequency_table( STAT_RECORD_STEP(STAT_UPDATE_FREQUENCY_TABLE, 1); #if DEBUG_READ_MAPPING - write_read_mapping_from_backtrack(ref_genome->seq_name[result_tab[pos].coord.seq_nr], genome_pos, backtrack_end, read, num); + write_read_mapping_from_backtrack(ref_genome->seq_name[result_tab[pos].coord.seq_nr], result_tab[pos].coord.seed_nr, backtrack_end, read, num); #endif STAT_RECORD_STEP(STAT_UPDATE_FREQUENCY_TABLE, 2); @@ -762,8 +762,8 @@ static void do_process_read(process_read_arg_t *arg) LOG_WARN("unusable pair (%u)\n", result_tab[i].num/4); }*/ } - } - if (true) { + } else { + //if (true) { // check mapping of R1 and R2 independently unsigned int best_score_R1[2] = { INVALID_SCORE, INVALID_SCORE }; diff --git a/host/src/vartree.c b/host/src/vartree.c index ba38e0e..2a080af 100644 --- a/host/src/vartree.c +++ b/host/src/vartree.c @@ -2,6 +2,7 @@ * Copyright 2016-2019 - Dominique Lavenier & UPMEM */ +#include #include #include #include @@ -522,7 +523,8 @@ void create_vcf() unsigned int max_coverage = 0; unsigned int position_most_coverage = 0; unsigned int chromosome_most_coverage = 99999; - unsigned int total_coverage = 0; + uint64_t total_coverage = 0; + uint64_t total_cov_squared = 0; variant_t variants_to_call[5]; // First pass on the frequency table to take into account variant codependence LOG_INFO("doing first pass of vc\n"); @@ -559,16 +561,16 @@ void create_vcf() total_score += frequency_table[2][ref_genome->pt_seq[seq_number] + seq_position].score; total_score += frequency_table[3][ref_genome->pt_seq[seq_number] + seq_position].score; total_coverage += total_score; + total_cov_squared += total_score*total_score; //total_score += 
frequency_table[4][ref_genome->pt_seq[seq_number] + seq_position].score; if (total_score == 0) { uncovered_nucleotides++; } else if (total_score < 10) { badly_covered_nucleotides++; - } else { + } else if (total_score < 90) { well_covered_nucleotides++; - if (total_score > 50) { - overly_covered_nucleotides++; - } + } else { + overly_covered_nucleotides++; if (total_score > max_coverage) { max_coverage = total_score; chromosome_most_coverage = seq_number; @@ -601,17 +603,20 @@ void create_vcf() (long)badly_covered_nucleotides, (long)badly_covered_nucleotides*100/total_nucleotides, (long)badly_covered_nucleotides*10000/total_nucleotides%100); - printf("\twell covered nucleotides (10 reads or more): %lu (%lu.%lu%%)\n", + printf("\twell covered nucleotides (10 to 90 reads): %lu (%lu.%lu%%)\n", (long)well_covered_nucleotides, (long)well_covered_nucleotides*100/total_nucleotides, (long)well_covered_nucleotides*10000/total_nucleotides%100); - printf("\toverly covered nucleotides (more than 50 reads): %lu (%lu.%lu%%)\n", + printf("\toverly covered nucleotides (more than 90 reads): %lu (%lu.%lu%%)\n", (long)overly_covered_nucleotides, (long)overly_covered_nucleotides*100/total_nucleotides, (long)overly_covered_nucleotides*10000/total_nucleotides%100); printf("\tmax coverage: %u reads\n", max_coverage); printf("\tmax coverage position: chr%u:%u\n", chromosome_most_coverage, position_most_coverage); - printf("\ttotal coverage: %u (eq %lu reads; or %lux coverage)\n", total_coverage, (long)total_coverage/SIZE_READ, (long)total_coverage/total_nucleotides); + printf("\ttotal coverage: %lu (eq %lu reads; or %lux coverage)\n", total_coverage, (long)total_coverage/SIZE_READ, (long)total_coverage/total_nucleotides); + double mean = ((double)total_coverage) / (double) total_nucleotides; + printf("\tmean cov: %f (std dev: %f)\n", mean, sqrt((double)total_cov_squared/(double)total_nucleotides - mean*mean)); printf("\tnumber of variants: %d (multiple %d)\n", nb_variant, 
nb_pos_multiple_var); printf("\ttime: %lf s\n", my_clock() - start_time); + fflush(stdout); } diff --git a/tests/igvlike-focus.py b/tests/igvlike-focus.py index 0e861f3..a4887fb 100755 --- a/tests/igvlike-focus.py +++ b/tests/igvlike-focus.py @@ -117,7 +117,9 @@ def get_intersecting_mappings(file_names, chromosome, start, end, max_read_size) current_chr, index, address = line.split() index = int(index) address = int(address) - if current_chr == chromosome and index >= start: + if current_chr == chromosome and index >= start or current_chr != chromosome and last_chr == chromosome: + if verbose>1: + print("starting reading ("+current_chr+":"+str(index)+") at "+str(address)) break mmap_map_file.seek(int(last_address)) yielded_line = False From d570f608abb5f1832f3c3eb02cb86f9d37d6477e Mon Sep 17 00:00:00 2001 From: amoisson Date: Fri, 6 May 2022 16:05:39 +0200 Subject: [PATCH 37/48] updated some constants to the best found yet for single-mapping --- host/src/vartree.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/host/src/vartree.c b/host/src/vartree.c index 2a080af..1c7ce9b 100644 --- a/host/src/vartree.c +++ b/host/src/vartree.c @@ -359,8 +359,8 @@ __attribute__((unused)) uint32_t depth_filter_fixed_3_f15(float freq) { return 3; } -#define AFFINE_B 1.37459 -#define AFFINE_A 0.0646469 +#define AFFINE_B 1.98142 +#define AFFINE_A 0.164761 float reverse_filter(uint32_t score) { return AFFINE_A*(float)score + AFFINE_B; @@ -424,8 +424,8 @@ static void add_codependence_to_freq_table(struct frequency_info** frequency_tab } } -#define POSITIVE_COD_INFLUENCE 0.00687069 -#define NEGATIVE_COD_INFLUENCE -0.236244 +#define POSITIVE_COD_INFLUENCE 0.0212905 +#define NEGATIVE_COD_INFLUENCE -0.580356 //TODO here read frequency table and write vcf (take max of frequency table to find substitution if any) void create_vcf() From 5cef587b13458845e030c2d36eb32f878cef56a7 Mon Sep 17 00:00:00 2001 From: amoisson Date: Wed, 18 May 2022 14:58:38 +0200 Subject: 
[PATCH 38/48] reduced max_score back to 40. It seems to have improved both algorithms Oo --- dpu/src/task.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dpu/src/task.c b/dpu/src/task.c index dd53a12..949bfe9 100644 --- a/dpu/src/task.c +++ b/dpu/src/task.c @@ -56,9 +56,9 @@ __host dpu_compute_time_t DPU_COMPUTE_TIME_VAR; * @brief Maximum score allowed. */ #if SIZE_READ>120 -#define MAX_SCORE 80 +#define MAX_SCORE 40 #else -#define MAX_SCORE 80 +#define MAX_SCORE 40 #endif /** From 0ecebfaedc2e65efc8de5b4e438fc489aad169e2 Mon Sep 17 00:00:00 2001 From: amoisson Date: Thu, 19 May 2022 09:46:22 +0200 Subject: [PATCH 39/48] back to 120 read size for integration purposes --- common/inc/common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/inc/common.h b/common/inc/common.h index 0e66dc1..e5bbde5 100644 --- a/common/inc/common.h +++ b/common/inc/common.h @@ -18,7 +18,7 @@ #define MAX_DPU_RESULTS (1 << 19) #define MAX_RESULTS_PER_READ (1 << 10) -#define SIZE_READ 148 +#define SIZE_READ 120 #define SIZE_SEED 14 #define SIZE_NEIGHBOUR_IN_BYTES ((SIZE_READ - SIZE_SEED) / 4) #define DELTA_NEIGHBOUR(round) ((SIZE_SEED * round) / 4) From 217906229109150cd073a969d9198d60e7d1b635 Mon Sep 17 00:00:00 2001 From: amoisson Date: Thu, 19 May 2022 14:55:42 +0200 Subject: [PATCH 40/48] removed a few ugly gotos and commented out code --- host/src/vartree.c | 90 ++++++++++++++++++---------------------------- 1 file changed, 34 insertions(+), 56 deletions(-) diff --git a/host/src/vartree.c b/host/src/vartree.c index 6dfdfdf..40d21eb 100644 --- a/host/src/vartree.c +++ b/host/src/vartree.c @@ -114,13 +114,13 @@ void variant_tree_insert(variant_t *var, uint32_t seq_nr, uint32_t offset_in_chr vars->depth++; vars->score += var->score; free(var); - goto end; + pthread_mutex_unlock(&mutex); + return; } vars = vars->next; } var->next = *entry; *entry = var; -end: pthread_mutex_unlock(&mutex); } @@ -235,46 +235,25 @@ static bool 
print_var_from_freq_table(variant_t *var, uint32_t seq_nr, uint64_t uint32_t cov = ref_genome->mapping_coverage[genome_pos]; uint32_t depth = var->depth; uint32_t score = var->score / depth; - // Note: commenting out the old version of variant calling, now using the frequency table - //uint32_t percentage = 100; - //if (cov != 0) { - // percentage = depth * 100 / cov; - //} uint32_t ref_len = strlen(var->ref); uint32_t alt_len = strlen(var->alt); - //if (ref_len > alt_len && percentage <= 25 && homopolymer(&ref_genome->data[genome_pos - 12], 12)) { - // return false; - //} - - if (get_no_filter()) - goto print; - - if (ref_len == alt_len) { /* SUBSTITUTION */ - //if (depth < 3) { - // return false; - //} else if (depth > 20) { - // depth = 20; - //} - //if (!(score <= sub_filter[depth].score && percentage >= sub_filter[depth].percentage)) { - // return false; - //} - if (depth > 20) { - depth = 20; - } - } else { /* INSERTION OR DELETION */ - if (depth < 2) { - return false; - } else if (depth > 11) { - depth = 11; + + if (!get_no_filter()) { + + if (ref_len == alt_len) { /* SUBSTITUTION */ + if (depth > 20) { + depth = 20; + } + } else { /* INSERTION OR DELETION */ + if (depth < 2) { + return false; + } else if (depth > 11) { + depth = 11; + } } - //if (!(score <= indel_filter[depth].score && percentage >= indel_filter[depth].percentage)) { - // return false; - //} } -print: - //TODO fprintf(vcf_file, "%s\t%lu\t.\t%s\t%s\t.\t.\tDEPTH=%d;COV=%d;SCORE=%d\n", chr, seq_pos+1, var->ref, var->alt, var->depth, cov, score); @@ -299,30 +278,29 @@ static bool print_variant_tree(variant_t *var, uint32_t seq_nr, uint64_t seq_pos return false; } - if (get_no_filter()) - goto print; + if (!get_no_filter()) { - if (ref_len == alt_len) { /* SUBSTITUTION */ - if (depth < 3) { - return false; - } else if (depth > 20) { - depth = 20; - } - if (!(score <= sub_filter[depth].score && percentage >= sub_filter[depth].percentage)) { - return false; - } - } else { /* INSERTION OR DELETION 
*/ - if (depth < 2) { - return false; - } else if (depth > 11) { - depth = 11; - } - if (!(score <= indel_filter[depth].score && percentage >= indel_filter[depth].percentage)) { - return false; + if (ref_len == alt_len) { /* SUBSTITUTION */ + if (depth < 3) { + return false; + } else if (depth > 20) { + depth = 20; + } + if (!(score <= sub_filter[depth].score && percentage >= sub_filter[depth].percentage)) { + return false; + } + } else { /* INSERTION OR DELETION */ + if (depth < 2) { + return false; + } else if (depth > 11) { + depth = 11; + } + if (!(score <= indel_filter[depth].score && percentage >= indel_filter[depth].percentage)) { + return false; + } } } -print: fprintf(vcf_file, "%s\t%lu\t.\t%s\t%s\t.\t.\tDEPTH=%d;COV=%d;SCORE=%d\n", chr, seq_pos+1, var->ref, var->alt, var->depth, cov, score); From 080bb41debf12b6768b58145953bc19147259f8a Mon Sep 17 00:00:00 2001 From: amoisson Date: Thu, 19 May 2022 17:49:46 +0200 Subject: [PATCH 41/48] fixed a TODO or two --- host/inc/processread.h | 1 + host/src/processread.c | 7 +- host/src/processread.c.old | 1099 ------------------------------------ 3 files changed, 4 insertions(+), 1103 deletions(-) delete mode 100644 host/src/processread.c.old diff --git a/host/inc/processread.h b/host/inc/processread.h index ded0249..dae9cc3 100644 --- a/host/inc/processread.h +++ b/host/inc/processread.h @@ -14,6 +14,7 @@ typedef struct { } backtrack_t; +#define CODE_MATCH 0 #define CODE_SUB 10 #define CODE_DEL 11 #define CODE_INS 12 diff --git a/host/src/processread.c b/host/src/processread.c index 33e08fd..e0fced6 100644 --- a/host/src/processread.c +++ b/host/src/processread.c @@ -327,7 +327,7 @@ int DPD(int8_t *s1, int8_t *s2, backtrack_t *backtrack, backtrack_t ** backtrack // Operation 0 : sequences match and nothing was done : decrease both indices. i--; j--; - (*backtrack_end)->type = 0; // FIXME : use CODE_??? 
instead of 0 + (*backtrack_end)->type = CODE_MATCH; (*backtrack_end)->ix = i; (*backtrack_end)->jx = j; (*backtrack_end)++; @@ -457,7 +457,7 @@ bool update_frequency_table( frequency_table[read_letter][current_position].unsure_score++; } break; - case 0: // FIXME : CODE_MATCH + case CODE_MATCH: // FIXME : CODE_MATCH read_letter = read[backtrack_end->jx]; if (read_letter > 3) { read_letter = read_letter>>1 & 0x3; @@ -476,7 +476,6 @@ bool update_frequency_table( //LOG_WARN("unhandled indel\n"); // TODO: handle indels break; - // TODO: handle errors even if they should never happen } } STAT_RECORD_STEP(STAT_UPDATE_FREQUENCY_TABLE, 4); @@ -601,7 +600,7 @@ static void set_variant(dpu_result_out_t result_match, genome_t *ref_genome, int if (backtrack_end->type != 0) { variant_t *newvar = (variant_t *)malloc(sizeof(variant_t)); newvar->depth = 1; - newvar->score = 1; + newvar->score = result_match.score; newvar->next = NULL; int alt_idx = 0; int ref_idx = 0; diff --git a/host/src/processread.c.old b/host/src/processread.c.old deleted file mode 100644 index a7303fd..0000000 --- a/host/src/processread.c.old +++ /dev/null @@ -1,1099 +0,0 @@ -/** - * Copyright 2016-2019 - Dominique Lavenier & UPMEM - */ - -#include -#include -#include -#include -#include -#include - -#include "accumulateread.h" -#include "common.h" -#include "debug.h" -#include "genome.h" -#include "getread.h" -#include "processread.h" -#include "mapping_file.h" -#include "upvc.h" -#include "vartree.h" - -#define DEBUG_READ_MAPPING true - -#define SIZE_INSERT_MEAN (400) -#define SIZE_INSERT_STD (3 * 50) - -#define CODE_A 0 /* ('A'>>1)&3 41H 0100 0001 */ -#define CODE_C 1 /* ('C'>>1)&3 43H 0100 0011 */ -#define CODE_T 2 /* ('T'>>1)&3 54H 0101 0100 */ -#define CODE_G 3 /* ('G'>>1)&3 47H 0100 0111 */ - -#define PQD_INIT_VAL (99) - -#if SIZE_READ>120 -#define MAX_SUBSTITUTION 31 -#else -#define MAX_SUBSTITUTION 20 -#endif - -static bool flag_dbg = false; - -typedef struct { - int type; - int ix; - int jx; -} 
backtrack_t; - -static int min(int a, int b) { return a < b ? a : b; } - -static void DPD_compute( - int s1, int s2, int *Dij, int Dijm, int Dimj, int Dimjm, int *Pij, int Pijm, int *Qij, int Qimj, int *xij) -{ - int min_QP, d; - - *Pij = min(Dijm + COST_GAPO, Pijm + COST_GAPE); - *Qij = min(Dimj + COST_GAPO, Qimj + COST_GAPE); - *xij = 0; - - int x; - if (*Pij < *Qij) { - min_QP = *Pij; - x = 2; - } else { - min_QP = *Qij; - x = 3; - } - d = Dimjm; - if ((s1 & 3) != (s2 & 3)) { - d += COST_SUB; - *xij = 1; - } - if (d < min_QP) { - *Dij = d; - } else { - *Dij = min_QP; - *xij = x; - } -} - -int DPD(int8_t *s1, int8_t *s2, backtrack_t *backtrack, int size_neighbour_in_symbols) -{ - int matrix_size = size_neighbour_in_symbols + 1; - int diagonal = (NB_DIAG / 2) + 1; - int D[matrix_size][matrix_size]; - int P[matrix_size][matrix_size]; - int Q[matrix_size][matrix_size]; - int X[matrix_size][matrix_size]; - int min_score = PQD_INIT_VAL; - int min_score_i_idx = 0; - int min_score_j_idx = 0; - int align_distance = 1; - - for (int i = 0; i < matrix_size; i++) { - for (int j = 0; j < matrix_size; j++) { - D[i][j] = 0; - } - } - for (int i = 0; i <= diagonal; i++) { - P[i][0] = PQD_INIT_VAL; - P[0][i] = PQD_INIT_VAL; - Q[i][0] = PQD_INIT_VAL; - Q[0][i] = PQD_INIT_VAL; - D[i][0] = i * COST_SUB; - D[0][i] = i * COST_SUB; - } - - for (int i = 1; i < diagonal; i++) { - for (int j = 1; j < i + diagonal; j++) { - DPD_compute(s1[i - 1], s2[j - 1], &D[i][j], D[i][j - 1], D[i - 1][j], D[i - 1][j - 1], &P[i][j], P[i][j - 1], - &Q[i][j], Q[i - 1][j], &X[i][j]); - } - Q[i][i + diagonal] = PQD_INIT_VAL; - D[i][i + diagonal] = PQD_INIT_VAL; - } - for (int i = diagonal; i < matrix_size - diagonal; i++) { - P[i][i - diagonal] = PQD_INIT_VAL; - D[i][i - diagonal] = PQD_INIT_VAL; - for (int j = i - diagonal + 1; j < i + diagonal; j++) { - DPD_compute(s1[i - 1], s2[j - 1], &D[i][j], D[i][j - 1], D[i - 1][j], D[i - 1][j - 1], &P[i][j], P[i][j - 1], - &Q[i][j], Q[i - 1][j], &X[i][j]); - } - 
Q[i][i + diagonal] = PQD_INIT_VAL; - D[i][i + diagonal] = PQD_INIT_VAL; - } - - for (int i = matrix_size - diagonal; i < matrix_size; i++) { - P[i][i - diagonal] = PQD_INIT_VAL; - D[i][i - diagonal] = PQD_INIT_VAL; - for (int j = i - diagonal + 1; j < matrix_size; j++) { - DPD_compute(s1[i - 1], s2[j - 1], &D[i][j], D[i][j - 1], D[i - 1][j], D[i - 1][j - 1], &P[i][j], P[i][j - 1], - &Q[i][j], Q[i - 1][j], &X[i][j]); - } - if (D[i][matrix_size - 1] < min_score) { - min_score = D[i][matrix_size - 1]; - min_score_i_idx = i; - min_score_j_idx = matrix_size - 1; - } - } - for (int j = matrix_size - diagonal; j < matrix_size; j++) { - if (D[matrix_size - 1][j] < min_score) { - min_score = D[matrix_size - 1][j]; - min_score_i_idx = matrix_size - 1; - min_score_j_idx = j; - } - } - - { - int i = min_score_i_idx; - int j = min_score_j_idx; - backtrack[0].type = CODE_END; - while ((i > 0) && (j > 0)) { - if(X[i][j] == 0) { - i--; - j--; - } else { - if(X[i][j] == 1) { - i--; - j--; - backtrack[align_distance].type = CODE_SUB; - backtrack[align_distance].ix = i; - backtrack[align_distance].jx = j; - align_distance++; - } else { - if(X[i][j] == 2) { - j--; - backtrack[align_distance].type = CODE_INS; - backtrack[align_distance].ix = i; - backtrack[align_distance].jx = j; - align_distance++; - } - else { - i--; - backtrack[align_distance].type = CODE_DEL; - backtrack[align_distance].ix = i; - backtrack[align_distance].jx = j; - align_distance++; - } - } - } - } - } - - return align_distance; -} - -/* - * encoding of differences between read and sequence of the reference genome. - * coding: substitution CODE_SUB pos x - * deletion CODE_DEL pos x+ - * insertion CODE_INS pos x+ - * end CODE_END - * - * x = A | C | G | T - * x+ = a sequence of at least 1 element (i.e. 
A, C, G ou T) - * pos = integer (8 bits) : give the offset of the variant from the start of the read - * - * example S 12 A D 56 A T G I 87 T C X ==> substitution (A) position 12, deletion (ATG) position 56, insertion (TC) position 87 - * The code is return in "code" as a table of int8_t - */ - -#ifdef USE_INDEL -static int code_alignment(uint8_t *code, int score, int8_t *gen, int8_t *read, unsigned size_neighbour_in_symbols, bool *flag) -{ - int code_idx, computed_score, backtrack_idx; - int size_read = SIZE_READ; - int size_neighbour = size_neighbour_in_symbols; - backtrack_t backtrak[size_read]; - - *flag = false; - - if (score == 0) { - code[0] = CODE_END; - return 1; - } - - /* First, looking for subsititution only */ - code_idx = 0; - computed_score = 0; - for (int i = SIZE_SEED; i < size_neighbour + SIZE_SEED; i++) { - if ((gen[i] & 3) != read[i]) { - computed_score += COST_SUB; - code[code_idx++] = CODE_SUB; - code[code_idx++] = i; - code[code_idx++] = read[i]; - if (computed_score > score) { - break; - } - } - } - code[code_idx++] = CODE_END; - if (computed_score == score) - return code_idx; - - /* Otherwise, re-compute the matrix (only some diagonals) and put in backtrack the path */ - backtrack_idx = DPD(gen, read, backtrak, size_neighbour_in_symbols + SIZE_SEED); - if (backtrack_idx == -1) { - code[0] = CODE_ERR; - return 1; - } - - backtrack_idx--; - code_idx = 0; - while (backtrack_idx > 0) { - if (backtrak[backtrack_idx].type == CODE_SUB) { - code[code_idx++] = CODE_SUB; - code[code_idx++] = backtrak[backtrack_idx].jx - 1; - code[code_idx++] = read[backtrak[backtrack_idx].jx - 1]; - backtrack_idx--; - } else { - if (backtrak[backtrack_idx].type == CODE_DEL) { - int backtrack_jx = backtrak[backtrack_idx].jx; - code[code_idx++] = CODE_DEL; - code[code_idx++] = backtrak[backtrack_idx].ix; - code[code_idx++] = gen[backtrak[backtrack_idx].ix] & 3; - backtrack_idx--; - while ((backtrak[backtrack_idx].type == CODE_DEL) && (backtrack_jx == 
backtrak[backtrack_idx].jx)) { - code[code_idx++] = gen[backtrak[backtrack_idx].ix] & 3; - backtrack_idx--; - } - } else { - int backtrack_ix = backtrak[backtrack_idx].ix; - code[code_idx++] = CODE_INS; - code[code_idx++] = backtrak[backtrack_idx].jx - 1; - code[code_idx++] = read[backtrak[backtrack_idx].jx]; - backtrack_idx--; - while ((backtrak[backtrack_idx].type == CODE_INS) && (backtrack_ix == backtrak[backtrack_idx].ix)) { - code[code_idx++] = read[backtrak[backtrack_idx].jx]; - backtrack_idx--; - } - } - } - } - code[code_idx++] = CODE_END; - return code_idx; -} -#endif - -#if 0 -static void set_variant( - dpu_result_out_t result_match, genome_t *ref_genome, int8_t *reads_buffer, unsigned int size_neighbour_in_symbols) -{ - uint32_t code_result_idx; - uint8_t code_result_tab[256]; - int8_t *read; - char nucleotide[4] = { 'A', 'C', 'T', 'G' }; - uint64_t genome_pos = ref_genome->pt_seq[result_match.coord.seq_nr] + result_match.coord.seed_nr; - int size_read = SIZE_READ; - //LOG_TRACE("set_variant called\n"); - - /* Get the differences betweend the read and the sequence of the reference genome that match */ - read = &reads_buffer[result_match.num * size_read]; - code_alignment(code_result_tab, result_match.score, &ref_genome->data[genome_pos], read, size_neighbour_in_symbols); - if (code_result_tab[0] == CODE_ERR) - return; - - /* Update "mapping_coverage" with the number of reads that match at this position of the genome */ - for (int i = 0; i < size_read; i++) { - ref_genome->mapping_coverage[genome_pos + i] += 1; - } - -#if DEBUG_READ_MAPPING - // TODO: check genome_pos is the expected value - write_read_mapping(genome_pos, code_result_tab); -#endif - - code_result_idx = 0; - while (code_result_tab[code_result_idx] != CODE_END) { - int code_result = code_result_tab[code_result_idx]; - //LOG_DEBUG("code_result=%d\n", code_result); - int64_t pos_variant_read = code_result_tab[code_result_idx + 1]; - int64_t pos_variant_genome = genome_pos + pos_variant_read; 
- int ref_pos = 0; - int alt_pos = 0; - variant_t *newvar = (variant_t *)malloc(sizeof(variant_t)); - newvar->depth = 1; - newvar->score = result_match.score; - newvar->next = NULL; - if (code_result == CODE_SUB) { - /* SNP = 0,1,2,3 (code A,C,T,G) */ - int snp = code_result_tab[code_result_idx + 2]; - newvar->ref[ref_pos++] = nucleotide[ref_genome->data[pos_variant_genome] & 3]; - newvar->alt[alt_pos++] = nucleotide[snp & 3]; - - code_result_idx += 3; - } else if (code_result == CODE_INS) { - int64_t ps_var_genome = pos_variant_genome; - int64_t ps_var_read = pos_variant_read; - code_result_idx += 2; - - while (code_result_tab[code_result_idx] < 4) { - ps_var_read++; - code_result_idx++; - } - - while (ref_genome->data[ps_var_genome] == read[ps_var_read]) { - ps_var_genome--; - ps_var_read--; - pos_variant_genome--; - pos_variant_read--; - } - - newvar->ref[ref_pos++] = nucleotide[ref_genome->data[pos_variant_genome] & 3]; - - while (pos_variant_read <= ps_var_read) { - newvar->alt[alt_pos++] = nucleotide[read[pos_variant_read] & 3]; - if (alt_pos >= MAX_SIZE_ALLELE - 1) { - free(newvar); - return; - } - pos_variant_read++; - } - - } else if (code_result == CODE_DEL) { - int64_t ps_var_genome = pos_variant_genome; - int64_t ps_var_read = pos_variant_read; - code_result_idx += 2; - - while (code_result_tab[code_result_idx] < 4) { - ps_var_genome++; - code_result_idx++; - } - - while (ref_genome->data[ps_var_genome] == read[ps_var_read]) { - ps_var_read--; - ps_var_genome--; - pos_variant_genome--; - pos_variant_read--; - } - - newvar->alt[alt_pos++] = nucleotide[ref_genome->data[pos_variant_genome] & 3]; - - while (pos_variant_genome <= ps_var_genome) { - newvar->ref[ref_pos++] = nucleotide[ref_genome->data[pos_variant_genome] & 3]; - if (ref_pos >= MAX_SIZE_ALLELE - 1) { - free(newvar); - //LOG_TRACE("set_variant early return\n"); - return; - } - pos_variant_genome++; - } - pos_variant_genome -= ref_pos; - } - newvar->ref[ref_pos] = '\0'; - newvar->alt[alt_pos] = 
'\0'; - variant_tree_insert( - newvar, result_match.coord.seq_nr, pos_variant_genome + 1 - ref_genome->pt_seq[result_match.coord.seq_nr]); - } - //LOG_TRACE("set_variant return\n"); -} -#endif - -static pthread_mutex_t non_mapped_mutex; -static void add_to_non_mapped_read(int numread, int round, FILE *fpe1, FILE *fpe2, int8_t *reads_buffer) -{ - if (fpe1 == NULL || fpe2 == NULL) - return; - pthread_mutex_lock(&non_mapped_mutex); - char nucleotide[4] = { 'A', 'C', 'T', 'G' }; - int size_read = SIZE_READ; - int8_t *read = &reads_buffer[numread * size_read]; - fprintf(fpe1, ">>%d\n", SIZE_SEED * (round + 1)); - for (int j = SIZE_SEED; j < size_read; j++) { - fprintf(fpe1, "%c", nucleotide[read[j] & 3]); - } - for (int j = 0; j < SIZE_SEED; j++) { - fprintf(fpe1, "A"); - } - fprintf(fpe1, "\n"); - read = &reads_buffer[(numread + 2) * size_read]; - fprintf(fpe2, ">>%d\n", SIZE_SEED * (round + 1)); - for (int j = SIZE_SEED; j < size_read; j++) { - fprintf(fpe2, "%c", nucleotide[read[j] & 3]); - } - for (int j = 0; j < SIZE_SEED; j++) { - fprintf(fpe2, "A"); - } - fprintf(fpe2, "\n"); - pthread_mutex_unlock(&non_mapped_mutex); -} - -#ifdef USE_INDEL -int get_read_update_positions( - uint64_t * update_genome_position, - dpu_result_out_t *result_tab, - int pos, - genome_t *ref_genome, - uint64_t genome_pos, - int8_t *read, - __attribute__((unused))int size_neighbour_in_symbols, - bool * flag, - bool debug, - uint32_t * substCnt, - char * chromosome_name) { - - // run smith and waterman algorithm to find indels - uint8_t code_result_tab[256]; - code_alignment(code_result_tab, result_tab[pos].score, &ref_genome->data[genome_pos], read, size_neighbour_in_symbols, flag); - write_read_mapping(chromosome_name, result_tab[pos].coord.seed_nr, code_result_tab, (uint8_t*) read); - for(int read_pos = 0; read_pos < SIZE_READ; ++read_pos) { - update_genome_position[read_pos] = 0; - } - if (code_result_tab[0] != CODE_ERR) { - - // array that contains for each read position, the genome 
position that it matches with - // This is the genome position that will be updated in the frequency table - // This genome position takes into account the shift due to possible indels found - // with smith-waterman algorithm - int code_result_index = 0; - int ref_pos = 0; - int nbIndels = 0; - bool ins = false; - while (code_result_tab[code_result_index] != CODE_END) { - int code_result = code_result_tab[code_result_index]; - int64_t pos_variant_read = code_result_tab[code_result_index + 1]; - /*printf("pos variant: %lu\n", pos_variant_read);*/ - int64_t pos_variant_genome = genome_pos + pos_variant_read; - if (code_result == CODE_SUB) { - // do nothing for substitution - code_result_index += 3; - (*substCnt)++; - ref_pos++; - } - else if (code_result == CODE_INS) { - ins = true; - int64_t ps_var_genome = pos_variant_genome; - int64_t ps_var_read = pos_variant_read; - code_result_index += 2; - - while (code_result_tab[code_result_index] < 4) { - ps_var_read++; - code_result_index++; - } - - while (ref_genome->data[ps_var_genome] == read[ps_var_read] && ps_var_genome - && pos_variant_read) { - assert(ps_var_genome && ps_var_read && pos_variant_genome && pos_variant_read); - ps_var_genome--; - ps_var_read--; - pos_variant_genome--; - pos_variant_read--; - } - - /*newvar->ref[ref_pos++] = nucleotide[ref_genome->data[pos_variant_genome] & 3];*/ - ref_pos++; - - // skip first value which should be the equivalent of first element in ref genome - pos_variant_read++; - while (pos_variant_read <= ps_var_read) { - // position should not be updated yet - if(update_genome_position[pos_variant_read] != 0) { - LOG_WARN("duplicate update (Insertion) at position %lu. 
Current %lu\n", - pos_variant_read, update_genome_position[pos_variant_read]); - fflush(stdout); - return -1; - } - update_genome_position[pos_variant_read++] = UINT64_MAX; - } - ++nbIndels; - } - else if (code_result == CODE_DEL) { - - int64_t ps_var_genome = pos_variant_genome; - int64_t ps_var_read = pos_variant_read; - code_result_index += 2; - - while (code_result_tab[code_result_index] < 4) { - ps_var_genome++; - code_result_index++; - } - - while (ref_genome->data[ps_var_genome] == read[ps_var_read] && pos_variant_genome && ps_var_read) { - assert(ps_var_genome && ps_var_read && pos_variant_genome && pos_variant_read); - ps_var_read--; - ps_var_genome--; - pos_variant_genome--; - pos_variant_read--; - } - - // on a deletion store the threshold to apply from the current read position - assert(ps_var_genome > pos_variant_genome); - if(pos_variant_read + 1 < SIZE_READ) { - // position should not be updated yet - if(update_genome_position[pos_variant_read+1] != 0) { - LOG_WARN("duplicate update (Deletion) at position %lu. 
Current %lu\n", - pos_variant_read+1, update_genome_position[pos_variant_read+1]); - fflush(stdout); - return -1; - } - /*assert(update_genome_position[pos_variant_read+1] == 0);*/ - update_genome_position[pos_variant_read + 1] = ps_var_genome - pos_variant_genome; - } - while (pos_variant_genome <= ps_var_genome) { - pos_variant_genome++; - ref_pos++; - } - pos_variant_genome -= ref_pos; - ++nbIndels; - } - else - assert(0); - } - - // debug prints - if(nbIndels && debug) - printf("SW algorithm (nbIndels %d) ins %d:\n", nbIndels, ins); - int64_t curr_pos = genome_pos; - for(int read_pos = 0; read_pos < SIZE_READ; ++read_pos) { - switch(update_genome_position[read_pos]) { - case 0: - update_genome_position[read_pos] = curr_pos++; - if(nbIndels && debug) - printf(" "); - break; - case UINT64_MAX: - if(nbIndels && debug) - printf("I"); - break; - default: - if(nbIndels && debug) { - for(uint64_t print_index = 0; print_index < update_genome_position[read_pos]; ++print_index) - printf("D"); - } - curr_pos += update_genome_position[read_pos]; - update_genome_position[read_pos] = curr_pos++; - } - } - - if(nbIndels && debug) { - printf("\n"); - fflush(stdout); - } - return nbIndels; - } - else - assert(0); - - return false; -} -#endif - - -static pthread_mutex_t freq_table_mutex; - -/** - * function to update frequency table used for variant calling - **/ -bool update_frequency_table( - genome_t *ref_genome, - dpu_result_out_t *result_tab, - int8_t *reads_buffer, - float *reads_quality_buffer, - int pos, - float mapq, - __attribute__((unused))int size_neighbour_in_symbols) { - - struct frequency_info **frequency_table = get_frequency_table(); - uint64_t genome_pos = ref_genome->pt_seq[result_tab[pos].coord.seq_nr] + result_tab[pos].coord.seed_nr; - int num = result_tab[pos].num; - int8_t *read = reads_buffer + (num * SIZE_READ); - float *read_quality = reads_quality_buffer + (num/2 * SIZE_READ); - //TODO: read the quality in the correct order (inverted or not) - bool inv 
= num & 1; - //TODO: assume no offset here - -#ifdef USE_INDEL - - static bool debug = false; - static char nucleotide[4] = { 'A', 'C', 'T', 'G' }; - uint64_t update_genome_position[SIZE_READ]; - uint32_t substCnt = 0; - flag_dbg = false; - - // for simplicity put all this in a critical section protected by a mutex - // since the frequency table is shared (but inefficient) - - pthread_mutex_lock(&freq_table_mutex); - int nbIndels = get_read_update_positions(update_genome_position, result_tab, pos, - ref_genome, genome_pos, read, size_neighbour_in_symbols, &flag_dbg, debug, &substCnt, ref_genome->seq_name[result_tab[pos].coord.seq_nr]); - bool hasIndel = nbIndels > 0; - - // debug prints - if(hasIndel && debug) { - - printf("Read:\n"); - for(int k = 0; k < SIZE_READ; ++k) { - printf("%c", nucleotide[read[k]]); - } - printf("\ngenome (pos %u:%u):\n", result_tab[pos].coord.seed_nr, result_tab[pos].coord.seed_nr+SIZE_READ); - for(uint64_t k = genome_pos; k < genome_pos + SIZE_READ; ++k) { - printf("%c", nucleotide[ref_genome->data[k]]); - } - printf("\nupdate pos:\n"); - uint64_t lastpos = 0; - for(uint64_t k = 0; k < SIZE_READ; ++k) { - if(k && update_genome_position[k] != lastpos+1) { - if(update_genome_position[k] == UINT64_MAX) - printf("X"); - /*printf("No update at position %lu\n", k);*/ - else if(lastpos == UINT64_MAX) - /*printf("New start at pos %lu = %lu / %c\n", k, update_genome_position[k], nucleotide[ref_genome->data[update_genome_position[k]]]);*/ - printf("%c", nucleotide[ref_genome->data[update_genome_position[k]]]); - else - /*printf("Change at pos %lu, diff %ld, %c\n", k, update_genome_position[k] - lastpos, nucleotide[ref_genome->data[update_genome_position[k]]]);*/ - printf("%c", nucleotide[ref_genome->data[update_genome_position[k]]]); - } - /*else if(nucleotide[ref_genome->data[update_genome_position[k]]] != nucleotide[read[k]]) {*/ - /*printf(RED "%c" RESET, nucleotide[ref_genome->data[update_genome_position[k]]]);*/ - /*}*/ - else - printf("%c", 
nucleotide[ref_genome->data[update_genome_position[k]]]); - lastpos = update_genome_position[k]; - } - - printf("\nsubst:\n"); - for(uint64_t k = 0; k < SIZE_READ; ++k) { - if(update_genome_position[k] == UINT64_MAX) { - printf(" "); - continue; - } - else if(nucleotide[ref_genome->data[update_genome_position[k]]] != nucleotide[read[k]]) { - printf("U"); - substCnt++; - } - else - printf(" "); - } - printf("\n\n"); - fflush(stdout); - assert(!result_tab[pos].coord.nodp); - } - else if(debug) { - if(!result_tab[pos].coord.nodp) { - LOG_WARN("odpd result with no indels detected (flag = %d, subst cnt %u)):\n", flag_dbg, substCnt); - printf("Read:\n"); - for(int k = 0; k < SIZE_READ; ++k) { - printf("%c", nucleotide[read[k]]); - } - printf("\ngenome (pos %u:%u):\n", result_tab[pos].coord.seed_nr, result_tab[pos].coord.seed_nr+SIZE_READ); - for(uint64_t k = genome_pos; k < genome_pos + SIZE_READ; ++k) { - printf("%c", nucleotide[ref_genome->data[k]]); - } - printf("\n\n"); - fflush(stdout); - } - } - - /*pthread_mutex_lock(&freq_table_mutex);*/ - // for the moment support only one indel otherwise we have some issue FIXME - if(substCnt <= MAX_SUBSTITUTION && (hasIndel || result_tab[pos].coord.nodp) && nbIndels >= 0) { - for(uint64_t k = 0; k < SIZE_READ; ++k) { - uint64_t update_genome_pos = update_genome_position[k]; - if(update_genome_pos < genome_get()->fasta_file_size) { - frequency_table[read[k]][update_genome_pos].freq += mapq * read_quality[inv ? 
SIZE_READ - k - 1 : k]; - /*frequency_table[read[j]][genome_pos+j].score += result_tab[pos].score;*/ - frequency_table[read[k]][update_genome_pos].score++; - } - else if (update_genome_pos != UINT64_MAX) - LOG_WARN("genome update position computed is wrong %lu\n", update_genome_pos); - } - } - /*fflush(stdout);*/ - pthread_mutex_unlock(&freq_table_mutex); - return hasIndel; - -#else - pthread_mutex_lock(&freq_table_mutex); - for(int j = 0; j < SIZE_READ; ++j) { - if(genome_pos + j < genome_get()->fasta_file_size) { - frequency_table[read[j]][genome_pos+j].freq += mapq * read_quality[inv ? SIZE_READ - j - 1 : j]; - frequency_table[read[j]][genome_pos+j].score++; - } - else - LOG_WARN("reads matched at position that exceeds genome size\n"); - } - pthread_mutex_unlock(&freq_table_mutex); - return false; -#endif -} - -static volatile unsigned int curr_match; -static pthread_mutex_t curr_match_mutex; -unsigned int acquire_curr_match() -{ - pthread_mutex_lock(&curr_match_mutex); - return curr_match; -} -void release_curr_match(unsigned int new_curr_match) -{ - curr_match = new_curr_match; - pthread_mutex_unlock(&curr_match_mutex); - return; -} - -static pthread_barrier_t barrier; - -typedef struct { - unsigned int nb_match; - dpu_result_out_t *result_tab; - int round; - int8_t *reads_buffer; - float *reads_quality_buffer; - genome_t *ref_genome; - FILE *fpe1; - FILE *fpe2; -} process_read_arg_t; - -static uint64_t nr_reads_total = 0ULL; -static uint64_t nr_reads_total_from_dpus = 0ULL; -static uint64_t nr_reads_non_mapped = 0ULL; -static uint64_t nr_reads_with_indels = 0ULL; -static pthread_mutex_t nr_reads_mutex = PTHREAD_MUTEX_INITIALIZER; - -#define MISMATCH_COUNT(X) (X.score / 10) -#define INVALID_SCORE 1000 - -static void keep_best_2_scores(unsigned score, unsigned* P1, unsigned *P2, unsigned x1, unsigned x2, unsigned* best_score) { - - if(score < best_score[0]) { - - // move current to next position - best_score[1] = best_score[0]; - P1[1] = P1[0]; - P2[1] = P2[0]; 
- // update first position - best_score[0] = score; - P1[0] = x1; - P2[0] = x2; - } - else if (score < best_score[1]) { - - // update second position - best_score[1] = score; - P1[1] = x1; - P2[1] = x2; - } -} - -static unsigned get_nb_scores(unsigned int * best_score) { - - unsigned np = 0; - if(best_score[0] < INVALID_SCORE) { - np++; - if(best_score[1] < INVALID_SCORE) np++; - } - return np; -} - -/*#define USE_MAPQ_SCORE*/ -static void do_process_read(process_read_arg_t *arg) -{ - const unsigned int nb_match = arg->nb_match; - dpu_result_out_t *result_tab = arg->result_tab; - int round = arg->round; - int8_t *reads_buffer = arg->reads_buffer; - float *reads_quality_buffer = arg->reads_quality_buffer; - genome_t *ref_genome = arg->ref_genome; - FILE *fpe1 = arg->fpe1; - FILE *fpe2 = arg->fpe2; - unsigned int size_neighbour_in_symbols = (SIZE_NEIGHBOUR_IN_BYTES - DELTA_NEIGHBOUR(round)) * 4; - /*printf("size_neighbour_in_symbols : %u", size_neighbour_in_symbols);*/ - - /* - * The number of a pair is given by "num_read / 4 " (see dispatch_read function) - * Their type is given by their offset (see dispatch_read function) - * type = num_read%4 == 0 ==> read 1 - * 1 ==> read 1 complement - * 2 ==> read 2 - * 3 ==> read 2 complement - * The read pair to consider are [0, 3] and [1, 2]. 
- * - * NEW (04/2020): - * - more paired reads are considered - * - when different position mapping are possible, choose the less covered zone - */ - - while (true) { - unsigned int i; - if ((i = acquire_curr_match()) >= nb_match) { - release_curr_match(i); - return; - } - int numpair = result_tab[i].num / 4; - unsigned int j = i; - while ((j < nb_match) && (numpair == result_tab[j].num / 4)) { - j++; - } - release_curr_match(j); - - // i = start index in result_tab - // j = stop index in result_tab - // select best couples of paired reads - unsigned int P1[2]; - unsigned int P2[2]; - unsigned int pos1, pos2, t1, t2; - unsigned int best_score[2] = { 1000, 1000 }; - /*unsigned int best_score_all = 1000;*/ - // test all significant pairs of reads (0,3) & (1,2) - for (unsigned int x1 = i; x1 < j; x1++) { - t1 = result_tab[x1].num % 4; - pos1 = result_tab[x1].coord.seed_nr; - for (unsigned int x2 = i + 1; x2 < j; x2++) { - pos2 = result_tab[x2].coord.seed_nr; - t2 = result_tab[x2].num % 4; - if (t1 + t2 == 3) // select significant pair - { - if ((abs((int)pos2 - (int)pos1) > READ_DIST_LOWER_BOUND && (abs((int)pos2 - (int)pos1) < READ_DIST_UPPER_BOUND))) { - // update if this is one of the two best scores - keep_best_2_scores(result_tab[x1].score + result_tab[x2].score, P1, P2, x1, x2, best_score); - } - } - } - } - - bool update = false; - bool hasIndel = false; - - unsigned np = get_nb_scores(best_score); - if (np > 0) { - - if(np == 2) { - - // found at least 2 matching pairs of positions. 
Check the delta between the two pairs to - // decide whether we should keep the best pair - int delta = abs((int)(MISMATCH_COUNT(result_tab[P1[0]]) + MISMATCH_COUNT(result_tab[P2[0]])) - - (int)(MISMATCH_COUNT(result_tab[P1[1]]) + MISMATCH_COUNT(result_tab[P2[1]]))); - - float mapq = 1.0f; -#ifdef USE_MAPQ_SCORE - int delta_corrected = MISMATCH_COUNT(result_tab[P1[0]]) - + MISMATCH_COUNT(result_tab[P2[0]]) + MAPQ_SCALING_FACTOR * ((2 * (MAX_SUBSTITUTION + 1)) - delta); - if(delta_corrected < 0) { - LOG_WARN("negative delta for square root %d\n", delta_corrected); - } - else if(delta > DIST_PAIR_THRESHOLD) { - mapq = 1.0 - sqrt((double)delta_corrected / SIZE_READ); -#else - if(delta > DIST_PAIR_THRESHOLD) { -#endif - hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P1[0], mapq, size_neighbour_in_symbols); - hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P2[0], mapq, size_neighbour_in_symbols); - update = true; - } - } - else if(np) { // only one result, take it - int delta = abs((int)(MISMATCH_COUNT(result_tab[P1[0]]) + MISMATCH_COUNT(result_tab[P2[0]])) - (2 * (MAX_SUBSTITUTION + 1))); - float mapq = 1.0f; -#ifdef USE_MAPQ_SCORE - int delta_corrected = MISMATCH_COUNT(result_tab[P1[0]]) + MISMATCH_COUNT(result_tab[P2[0]]) + MAPQ_SCALING_FACTOR * ((2 * (MAX_SUBSTITUTION + 1)) - delta); - if(delta_corrected < 0) { - LOG_WARN("negative delta (np == 1) for square root %d\n", delta_corrected); - } - else if(delta > DIST_PAIR_THRESHOLD) { - mapq = 1.0 - sqrt((double)delta_corrected / SIZE_READ); -#else - if(delta > DIST_PAIR_THRESHOLD) { -#endif - hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P1[0], mapq, size_neighbour_in_symbols); - hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P2[0], mapq, size_neighbour_in_symbols); - update = true; - } - } - } - if(true) { - - // check mapping 
of R1 and R2 independently - unsigned int best_score_R1[2] = { 1000, 1000 }; - unsigned int best_score_R2[2] = { 1000, 1000 }; - P1[0] = 0; - P2[0] = 0; - P1[1] = 0; - P2[1] = 0; - for (unsigned int read = i; read < j; read++) { - unsigned t1 = result_tab[read].num % 4; - if(t1 < 2) { // PE1 or RPE1 - keep_best_2_scores(result_tab[read].score, P1, P2, read, 0, best_score_R1); - } - else { // PE2 or RPE2 - keep_best_2_scores(result_tab[read].score, P1, P2, 0, read, best_score_R2); - } - } - - unsigned np1 = get_nb_scores(best_score_R1), np2 = get_nb_scores(best_score_R2); - if(np1 == 2) { - - int delta = abs((int)MISMATCH_COUNT(result_tab[P1[0]]) - (int)MISMATCH_COUNT(result_tab[P1[1]])); - - float mapq = 1.0f; -#ifdef USE_MAPQ_SCORE - int delta_corrected = MISMATCH_COUNT(result_tab[P1[0]]) + MAPQ_SCALING_FACTOR * ((MAX_SUBSTITUTION + 1) - delta); - - if(delta_corrected < 0) { - LOG_WARN("negative delta (np1 == 2) for square root %d\n", delta_corrected); - } - else if(delta > DIST_SINGLE_THRESHOLD) { - mapq = 1.0 - sqrt((double)delta_corrected / SIZE_READ); -#else - if(delta > DIST_SINGLE_THRESHOLD) { -#endif - hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P1[0], mapq, size_neighbour_in_symbols); - update = true; - } - } - else if(np1) { - int delta = abs((int)MISMATCH_COUNT(result_tab[P1[0]]) - (MAX_SUBSTITUTION + 1)); - - float mapq = 1.0f; -#ifdef USE_MAPQ_SCORE - int delta_corrected = MISMATCH_COUNT(result_tab[P1[0]]) + MAPQ_SCALING_FACTOR * ((MAX_SUBSTITUTION + 1) - delta); - - if(delta_corrected < 0) { - LOG_WARN("negative delta (np1 == 1) for square root %d\n", delta_corrected); - } - else if(delta > DIST_SINGLE_THRESHOLD) { - mapq = 1.0 - sqrt((double)delta_corrected / SIZE_READ); -#else - if(delta > DIST_SINGLE_THRESHOLD) { -#endif - hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P1[0], mapq, size_neighbour_in_symbols); - update = true; - } - } - - if(np2 == 2) 
{ - - int delta = abs((int)MISMATCH_COUNT(result_tab[P2[0]]) - (int)MISMATCH_COUNT(result_tab[P2[1]])); - - float mapq = 1.0f; -#ifdef USE_MAPQ_SCORE - int delta_corrected = MISMATCH_COUNT(result_tab[P2[0]]) + MAPQ_SCALING_FACTOR * ((MAX_SUBSTITUTION + 1) - delta); - - if(delta_corrected < 0) { - LOG_WARN("negative delta (np2 == 2) for square root %d, %d %d %d %d\n", - delta_corrected, MISMATCH_COUNT(result_tab[P2[0]]), MISMATCH_COUNT(result_tab[P2[1]]), MAX_SUBSTITUTION + 1, delta); - } - else if(delta > DIST_SINGLE_THRESHOLD) { - mapq = 1.0 - sqrt((double)delta_corrected / SIZE_READ); -#else - if(delta > DIST_SINGLE_THRESHOLD) { -#endif - - hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P2[0], mapq, size_neighbour_in_symbols); - update = true; - } - } - else if(np2) { - int delta = abs((int)MISMATCH_COUNT(result_tab[P2[0]]) - (MAX_SUBSTITUTION + 1)); - - float mapq = 1.0f; -#ifdef USE_MAPQ_SCORE - int delta_corrected = MISMATCH_COUNT(result_tab[P2[0]]) + MAPQ_SCALING_FACTOR * ((MAX_SUBSTITUTION + 1) - delta); - - if(delta_corrected < 0) { - LOG_WARN("negative delta (np2 == 1) for square root %d\n", delta_corrected); - } - else if (delta > DIST_SINGLE_THRESHOLD) { - mapq = 1.0 - sqrt((double)delta_corrected / SIZE_READ); -#else - if (delta > DIST_SINGLE_THRESHOLD) { -#endif - hasIndel |= update_frequency_table(ref_genome, result_tab, reads_buffer, reads_quality_buffer, P2[0], mapq, size_neighbour_in_symbols); - update = true; - } - } - if(!update) { - pthread_mutex_lock(&nr_reads_mutex); - nr_reads_non_mapped++; - pthread_mutex_unlock(&nr_reads_mutex); - add_to_non_mapped_read(numpair * 4, round, fpe1, fpe2, reads_buffer); - } - if(hasIndel) - nr_reads_with_indels++; - pthread_mutex_lock(&nr_reads_mutex); - nr_reads_total_from_dpus++; - pthread_mutex_unlock(&nr_reads_mutex); - } - } -} - -#define PROCESS_READ_THREAD (8) -#define PROCESS_READ_THREAD_SLAVE (PROCESS_READ_THREAD - 1) -static process_read_arg_t args; 
-static pthread_t thread_id[PROCESS_READ_THREAD_SLAVE]; -static bool stop_threads = false; - -void process_read(FILE *fpe1, FILE *fpe2, int round, unsigned int pass_id) -{ - int8_t *reads_buffer = get_reads_buffer(pass_id); - float *reads_quality_buffer = get_reads_quality_buffer(pass_id); - acc_results_t acc_res = accumulate_get_result(pass_id); - nr_reads_total += get_reads_in_buffer(pass_id) / 4; - - curr_match = 0; - - args.nb_match = acc_res.nb_res; - args.result_tab = acc_res.results; - args.round = round; - args.reads_buffer = reads_buffer; - args.reads_quality_buffer = reads_quality_buffer; - args.fpe1 = fpe1; - args.fpe2 = fpe2; - - pthread_barrier_wait(&barrier); - do_process_read(&args); - pthread_barrier_wait(&barrier); - - free(acc_res.results); -} - -static void *process_read_thread_fct(void *arg) -{ - pthread_barrier_wait(&barrier); - while (!stop_threads) { - do_process_read(arg); - pthread_barrier_wait(&barrier); - pthread_barrier_wait(&barrier); - } - return NULL; -} - -void process_read_init() -{ -#if DEBUG_READ_MAPPING - open_mapping_file(); -#endif - genome_t *ref_genome = genome_get(); - args.ref_genome = ref_genome; - - assert(pthread_mutex_init(&curr_match_mutex, NULL) == 0); - assert(pthread_mutex_init(&non_mapped_mutex, NULL) == 0); - assert(pthread_mutex_init(&freq_table_mutex, NULL) == 0); - assert(pthread_barrier_init(&barrier, NULL, PROCESS_READ_THREAD) == 0); - - for (unsigned int each_thread = 0; each_thread < PROCESS_READ_THREAD_SLAVE; each_thread++) { - assert(pthread_create(&thread_id[each_thread], NULL, process_read_thread_fct, &args) == 0); - } -} - -void process_read_free() -{ -#if DEBUG_READ_MAPPING - close_mapping_file(); -#endif - stop_threads = true; - pthread_barrier_wait(&barrier); - - for (unsigned int each_thread = 0; each_thread < PROCESS_READ_THREAD_SLAVE; each_thread++) { - assert(pthread_join(thread_id[each_thread], NULL) == 0); - } - - assert(pthread_barrier_destroy(&barrier) == 0); - 
assert(pthread_mutex_destroy(&curr_match_mutex) == 0); - assert(pthread_mutex_destroy(&non_mapped_mutex) == 0); - assert(pthread_mutex_destroy(&freq_table_mutex) == 0); - fflush(stdout); - fprintf(stderr, "%% reads non mapped: %f%%\n", (float)nr_reads_non_mapped * 100.0 / (float)nr_reads_total_from_dpus); - fprintf(stderr, "%% reads with indels: %f%%\n", (float)nr_reads_with_indels * 100.0 / (float)(nr_reads_total_from_dpus - nr_reads_non_mapped)); - fprintf(stderr, "%% Total reads from dpus: %ld%%\n", nr_reads_total_from_dpus); - fprintf(stderr, "%% Total reads: %ld%%\n", nr_reads_total); -} From 00f8a001b6de3fb4e8b3aa36732473c69b10dcf3 Mon Sep 17 00:00:00 2001 From: amoisson Date: Mon, 30 May 2022 10:28:26 +0200 Subject: [PATCH 42/48] made filters adapt to more read sizes --- host/src/vartree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/host/src/vartree.c b/host/src/vartree.c index 40d21eb..a7cdbc1 100644 --- a/host/src/vartree.c +++ b/host/src/vartree.c @@ -180,7 +180,7 @@ depth_filter_t indel_filter[] = { [10] = { 1, 30 }, [11] = { 1, 40 }, }; -#elif (SIZE_READ == 150) || (SIZE_READ==148) +#elif (SIZE_READ <= 160) && (SIZE_READ>=140) depth_filter_t sub_filter[] = { [3] = { 15, 16 }, [4] = { 17, 20 }, From 12ef73c9efa4e67e3495b744ea865b252a0f6a39 Mon Sep 17 00:00:00 2001 From: amoisson Date: Mon, 30 May 2022 13:50:30 +0200 Subject: [PATCH 43/48] removed unused code and some deprecated TODOs --- host/inc/debug.h | 2 +- host/src/genome.c | 7 ---- host/src/mapping_file.c | 6 --- host/src/parse_args.c | 1 - host/src/processread.c | 92 ----------------------------------------- host/src/vartree.c | 3 +- 6 files changed, 2 insertions(+), 109 deletions(-) diff --git a/host/inc/debug.h b/host/inc/debug.h index 6bf1dde..06dc925 100644 --- a/host/inc/debug.h +++ b/host/inc/debug.h @@ -10,7 +10,7 @@ #define V_DEBUG 5 #define V_TRACE 6 -#define VERBOSE V_INFO +#define VERBOSE V_WARN #define VERBOSE_COLORS true #define VERBOSE_LOG_LEVEL true #define 
VERBOSE_TIMESTAMP true diff --git a/host/src/genome.c b/host/src/genome.c index d98c056..aaa811e 100644 --- a/host/src/genome.c +++ b/host/src/genome.c @@ -129,7 +129,6 @@ static struct frequency_info* frequency_table[5]; static struct variants_codependence_info_list** variant_codependences; #define NB_CODEPENDENCE_CHUNK_DIV (1<<10) struct codependence_chunk* last_allocated_codependence_chunk[NB_CODEPENDENCE_CHUNK_DIV]; -unsigned int allocated_chunks = 0;//for debug purposes (TODO: remove) static bool init_frequency_table = false; static bool init_codependence_table = false; @@ -143,11 +142,6 @@ struct codependence_chunk { static void allocate_new_codependence_chunk(int i) { STAT_RECORD_START(STAT_ALLOCATE_NEW_CHUNK); - // LOG_INFO("size of pointer: %d\n", (int) sizeof(void*)); - // LOG_INFO("size of co-info: %d\n", (int) sizeof(struct variants_codependence_info)); - // LOG_INFO("size of co-info-list: %d\n", (int) sizeof(struct variants_codependence_info_list)); - LOG_INFO("size of chunk: %d\n", (int) sizeof(struct codependence_chunk)); - LOG_INFO("allocating new codependence chunk (%d)\n", ++allocated_chunks); struct codependence_chunk* new_chunk = calloc(1, sizeof(struct codependence_chunk)); assert(new_chunk != NULL); new_chunk->previous_chunk = last_allocated_codependence_chunk[i]; @@ -192,7 +186,6 @@ void add_codependence_info(struct variants_codependence_info_list** next_variant } } } - LOG_DEBUG("getting new codependence_info for : ? 
-> %010d (char:%d)\n", other_index_delta, other_letter); int chunk_div_index = (other_index_delta*NB_CODEPENDENCE_CHUNK_DIV)/genome_size; STAT_RECORD_STEP(STAT_ADD_CODEPENDENCE_INFO, 2); pthread_mutex_lock(mutex); diff --git a/host/src/mapping_file.c b/host/src/mapping_file.c index 9345273..8e33737 100644 --- a/host/src/mapping_file.c +++ b/host/src/mapping_file.c @@ -37,7 +37,6 @@ static char *get_mapping_filename() void open_mapping_file() { LOG_DEBUG("opening mapping file\n"); - // TODO: check for memory leaks here char *filename = get_mapping_filename(); mapping_file = fopen(filename, "w"); if (mapping_file == NULL) @@ -45,11 +44,6 @@ void open_mapping_file() LOG_FATAL("couldn't open mapping file; errno : %u\n", errno); } LOG_DEBUG("openned mapping file : %p\n", mapping_file); - // TODO: complete header - LOG_TRACE("writing mapping header\n"); - //fprintf(mapping_file, "@HD VN:" MAP_VERSION " SO:unknown\n"); - //fprintf(mapping_file, "@PG ID:1 PN:" PROGRAM_NAME "\n"); - //LOG_DEBUG("mapping header written\n"); } void write_read_mapping_from_backtrack(char *chromosome_name, uint64_t genome_pos, backtrack_t *backtrack_end, int8_t *read, int read_id) diff --git a/host/src/parse_args.c b/host/src/parse_args.c index cdf7b7b..fef1b51 100644 --- a/host/src/parse_args.c +++ b/host/src/parse_args.c @@ -166,7 +166,6 @@ unsigned int get_nb_thread_for_simu() { return nb_thread_for_simu; } /**************************************************************************************/ /**************************************************************************************/ -//TODO: validate use_freq_table static void validate_use_frequency_table() { use_freq_table = true; } diff --git a/host/src/processread.c b/host/src/processread.c index e0fced6..fc7e15a 100644 --- a/host/src/processread.c +++ b/host/src/processread.c @@ -42,15 +42,6 @@ #define MAX_SCORE_DIFFERENCE_WITH_BEST 40 #define MAX_CONSIDERED_MAPPINGS 4000 -/* -static void log_nucleotides(int8_t *s, int max_len) { - char 
nucleotides[4] = {'A', 'C', 'T', 'G'}; - for (int i=0; idepth = 1; - newvar->score = result_match.score; - newvar->next = NULL; - if (code_result == CODE_SUB) { - /* SNP = 0,1,2,3 (code A,C,T,G) */ - int snp = code_result_tab[code_result_idx + 2]; - newvar->ref[ref_pos++] = nucleotide[ref_genome->data[pos_variant_genome] & 3]; - newvar->alt[alt_pos++] = nucleotide[snp & 3]; - - code_result_idx += 3; - } else if (code_result == CODE_INS) { - int64_t ps_var_genome = pos_variant_genome; - int64_t ps_var_read = pos_variant_read; - code_result_idx += 2; - - while (code_result_tab[code_result_idx] < 4) { - ps_var_read++; - code_result_idx++; - } - - while (ref_genome->data[ps_var_genome] == read[ps_var_read]) { - ps_var_genome--; - ps_var_read--; - pos_variant_genome--; - pos_variant_read--; - } - - newvar->ref[ref_pos++] = nucleotide[ref_genome->data[pos_variant_genome] & 3]; - - while (pos_variant_read <= ps_var_read) { - newvar->alt[alt_pos++] = nucleotide[read[pos_variant_read] & 3]; - if (alt_pos >= MAX_SIZE_ALLELE - 1) { - free(newvar); - return; - } - pos_variant_read++; - } - - } else if (code_result == CODE_DEL) { - int64_t ps_var_genome = pos_variant_genome; - int64_t ps_var_read = pos_variant_read; - code_result_idx += 2; - - while (code_result_tab[code_result_idx] < 4) { - ps_var_genome++; - code_result_idx++; - } - - while (ref_genome->data[ps_var_genome] == read[ps_var_read]) { - ps_var_read--; - ps_var_genome--; - pos_variant_genome--; - pos_variant_read--; - } - - newvar->alt[alt_pos++] = nucleotide[ref_genome->data[pos_variant_genome] & 3]; - - while (pos_variant_genome <= ps_var_genome) { - newvar->ref[ref_pos++] = nucleotide[ref_genome->data[pos_variant_genome] & 3]; - if (ref_pos >= MAX_SIZE_ALLELE - 1) { - free(newvar); - return; - } - pos_variant_genome++; - } - pos_variant_genome -= ref_pos; - } - newvar->ref[ref_pos] = '\0'; - newvar->alt[alt_pos] = '\0'; - variant_tree_insert( - newvar, result_match.coord.seq_nr, pos_variant_genome + 1 - 
ref_genome->pt_seq[result_match.coord.seq_nr]); - } - #endif } static pthread_mutex_t non_mapped_mutex; diff --git a/host/src/vartree.c b/host/src/vartree.c index a7cdbc1..bd4dc10 100644 --- a/host/src/vartree.c +++ b/host/src/vartree.c @@ -394,7 +394,7 @@ FILE * sub_file = NULL; static void get_most_frequent_variant(genome_t * ref_genome, struct frequency_info ** frequency_table, uint32_t seq_number, uint64_t seq_position, variant_t * results) { - static char nucleotide[4] = { 'A', 'C', 'T', 'G' };// FIXME : const + const char nucleotide[4] = { 'A', 'C', 'T', 'G' }; uint64_t genome_pos = ref_genome->pt_seq[seq_number] + seq_position; @@ -450,7 +450,6 @@ static void add_codependence_to_freq_table(struct frequency_info** frequency_tab #define POSITIVE_COD_INFLUENCE 0.0212905 #define NEGATIVE_COD_INFLUENCE -0.580356 -//TODO here read frequency table and write vcf (take max of frequency table to find substitution if any) void create_vcf() { double start_time = my_clock(); From 4b85cb6fee0fa54a071e276f6a4eb97f1d99fa86 Mon Sep 17 00:00:00 2001 From: amoisson Date: Tue, 31 May 2022 15:22:10 +0200 Subject: [PATCH 44/48] made profiling optionnal and removed some useless code --- host/inc/profiling.h | 28 +++++++++++++++++++++------- host/src/processread.c | 9 +-------- 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/host/inc/profiling.h b/host/inc/profiling.h index e30b305..8028c6a 100644 --- a/host/inc/profiling.h +++ b/host/inc/profiling.h @@ -3,6 +3,7 @@ #include +#define PROFILE_STAT false #define STAT_MAX_SUBSTEPS 10 struct time_stat_t { @@ -32,6 +33,16 @@ struct time_stat_t profiling[25]; #define STAT_GET_NEW_CODEPENDENCE_INFO 16 #define STAT_ALLOCATE_NEW_CHUNK 17 +#define PRINT_MICROSECONDS(t) \ + if (tjx]; if (read_letter > 3) { read_letter = read_letter>>1 & 0x3; From f546d33c7ce45e5d3fbd10616aa98315ae728163 Mon Sep 17 00:00:00 2001 From: amoisson Date: Wed, 20 Jul 2022 11:27:44 +0200 Subject: [PATCH 45/48] fixed no freq table issues --- 
host/inc/vartree.h | 4 ++-- host/src/processread.c | 18 +++++++++++--- host/src/vartree.c | 54 ++++++++++++++++++++++-------------------- 3 files changed, 45 insertions(+), 31 deletions(-) diff --git a/host/inc/vartree.h b/host/inc/vartree.h index 5276070..bb4feaa 100644 --- a/host/inc/vartree.h +++ b/host/inc/vartree.h @@ -12,8 +12,8 @@ typedef struct variant { uint32_t score; uint32_t depth; - char ref[MAX_SIZE_ALLELE]; - char alt[MAX_SIZE_ALLELE]; + char ref[MAX_SIZE_ALLELE+1]; + char alt[MAX_SIZE_ALLELE+1]; struct variant *next; } variant_t; diff --git a/host/src/processread.c b/host/src/processread.c index 99ef710..c78f463 100644 --- a/host/src/processread.c +++ b/host/src/processread.c @@ -573,6 +573,10 @@ static void set_variant(dpu_result_out_t result_match, genome_t *ref_genome, int for (int i = 0; i < SIZE_READ; i++) { ref_genome->mapping_coverage[genome_pos + i] += 1; } + if (result_match.coord.seq_nr > ref_genome->nb_seq) { + LOG_WARN("skipping a read on a bogus sequence: %d\n", result_match.coord.seq_nr); + return; + } for (; backtrack_end > backtrack; backtrack_end--) { unsigned int current_position = genome_pos+backtrack_end->ix; @@ -605,19 +609,22 @@ static void set_variant(dpu_result_out_t result_match, genome_t *ref_genome, int newvar->ref[ref_idx++] = nucleotide[ref_genome->data[genome_pos+backtrack_end->ix] & 0x3]; newvar->alt[alt_idx++] = nucleotide[read[backtrack_end->jx] & 0x3]; if (alt_idx >= MAX_SIZE_ALLELE || ref_idx >= MAX_SIZE_ALLELE) { - LOG_WARN("ignored read because of a too complex variant\n") + LOG_WARN("clipped a variant because it was too complex") + break; } break; case CODE_INS: newvar->alt[alt_idx++] = nucleotide[read[backtrack_end->jx] & 0x3]; if (alt_idx >= MAX_SIZE_ALLELE) { - LOG_WARN("ignored read because of a too complex variant\n") + LOG_WARN("clipped a variant because it was too complex") + break; } break; case CODE_DEL: newvar->ref[ref_idx++] = nucleotide[ref_genome->data[genome_pos+backtrack_end->ix] & 0x3]; if 
(ref_idx >= MAX_SIZE_ALLELE) { - LOG_WARN("ignored read because of a too complex variant\n") + LOG_WARN("clipped a variant because it was too complex") + break; } break; } @@ -725,6 +732,11 @@ static void do_process_read(process_read_arg_t *arg) considered_mappings[t][nb_considered_mappings[t]++] = x; } } + for (int l=0; l<4; l++) { + if (nb_considered_mappings[l] > 4000) { + LOG_WARN("nb_considered_mappings[%d] = %u\n", l, nb_considered_mappings[l]); + } + } // i = start index in result_tab // j = stop index in result_tab diff --git a/host/src/vartree.c b/host/src/vartree.c index bd4dc10..bfb54dc 100644 --- a/host/src/vartree.c +++ b/host/src/vartree.c @@ -382,8 +382,8 @@ __attribute__((unused)) uint32_t depth_filter_fixed_3_f15(float freq) { return 3; } -#define AFFINE_B 1.98142 -#define AFFINE_A 0.164761 +#define AFFINE_B 2.9795 +#define AFFINE_A 0.152036 float reverse_filter(uint32_t score) { return AFFINE_A*(float)score + AFFINE_B; @@ -447,8 +447,8 @@ static void add_codependence_to_freq_table(struct frequency_info** frequency_tab } } -#define POSITIVE_COD_INFLUENCE 0.0212905 -#define NEGATIVE_COD_INFLUENCE -0.580356 +#define POSITIVE_COD_INFLUENCE 0.0103517 +#define NEGATIVE_COD_INFLUENCE -0.80034 void create_vcf() { @@ -613,6 +613,30 @@ void create_vcf() } } } + + unsigned long total_nucleotides = overly_covered_nucleotides+well_covered_nucleotides+badly_covered_nucleotides+uncovered_nucleotides; + printf("\tuncovered nucleotides: %lu (%lu.%lu%%)\n", + (long)uncovered_nucleotides, + (long)uncovered_nucleotides*100/total_nucleotides, + (long)uncovered_nucleotides*10000/total_nucleotides%100); + printf("\tbadly covered nucleotides (less than 10 reads): %lu (%lu.%lu%%)\n", + (long)badly_covered_nucleotides, + (long)badly_covered_nucleotides*100/total_nucleotides, + (long)badly_covered_nucleotides*10000/total_nucleotides%100); + printf("\twell covered nucleotides (10 to 90 reads): %lu (%lu.%lu%%)\n", + (long)well_covered_nucleotides, + 
(long)well_covered_nucleotides*100/total_nucleotides, + (long)well_covered_nucleotides*10000/total_nucleotides%100); + printf("\toverly covered nucleotides (more than 90 reads): %lu (%lu.%lu%%)\n", + (long)overly_covered_nucleotides, + (long)overly_covered_nucleotides*100/total_nucleotides, + (long)overly_covered_nucleotides*10000/total_nucleotides%100); + printf("\tmax coverage: %u reads\n", max_coverage); + printf("\tmax coverage position: chr%u:%u\n", chromosome_most_coverage, position_most_coverage); + printf("\ttotal coverage: %lu (eq %lu reads; or %lux coverage)\n", total_coverage, (long)total_coverage/SIZE_READ, (long)total_coverage/total_nucleotides); + double mean = ((double)total_coverage) / (double) total_nucleotides; + printf("\tmean cov: %f (std dev: %f)\n", mean, sqrt((double)total_cov_squared/(double)total_nucleotides - mean*mean)); + } else { // Using var-tree and not freq-table LOG_INFO("doing first and only pass of vc\n"); for (uint32_t seq_number = 0; seq_number < ref_genome->nb_seq; seq_number++) { @@ -631,28 +655,6 @@ void create_vcf() free_frequency_table(); fclose(vcf_file); - unsigned long total_nucleotides = overly_covered_nucleotides+well_covered_nucleotides+badly_covered_nucleotides+uncovered_nucleotides; - printf("\tuncovered nucleotides: %lu (%lu.%lu%%)\n", - (long)uncovered_nucleotides, - (long)uncovered_nucleotides*100/total_nucleotides, - (long)uncovered_nucleotides*10000/total_nucleotides%100); - printf("\tbadly covered nucleotides (less than 10 reads): %lu (%lu.%lu%%)\n", - (long)badly_covered_nucleotides, - (long)badly_covered_nucleotides*100/total_nucleotides, - (long)badly_covered_nucleotides*10000/total_nucleotides%100); - printf("\twell covered nucleotides (10 to 90 reads): %lu (%lu.%lu%%)\n", - (long)well_covered_nucleotides, - (long)well_covered_nucleotides*100/total_nucleotides, - (long)well_covered_nucleotides*10000/total_nucleotides%100); - printf("\toverly covered nucleotides (more than 90 reads): %lu (%lu.%lu%%)\n", - 
(long)overly_covered_nucleotides, - (long)overly_covered_nucleotides*100/total_nucleotides, - (long)overly_covered_nucleotides*10000/total_nucleotides%100); - printf("\tmax coverage: %u reads\n", max_coverage); - printf("\tmax coverage position: chr%u:%u\n", chromosome_most_coverage, position_most_coverage); - printf("\ttotal coverage: %lu (eq %lu reads; or %lux coverage)\n", total_coverage, (long)total_coverage/SIZE_READ, (long)total_coverage/total_nucleotides); - double mean = ((double)total_coverage) / (double) total_nucleotides; - printf("\tmean cov: %f (std dev: %f)\n", mean, sqrt((double)total_cov_squared/(double)total_nucleotides - mean*mean)); printf("\tnumber of variants: %d (multiple %d)\n", nb_variant, nb_pos_multiple_var); printf("\ttime: %lf s\n", my_clock() - start_time); fflush(stdout); From 53e8cd2881df2bd0a90fa2aaedaf36c72ccb509f Mon Sep 17 00:00:00 2001 From: amoisson Date: Wed, 20 Jul 2022 14:05:34 +0200 Subject: [PATCH 46/48] added debug read mapping to no-freq-table --- host/src/processread.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/host/src/processread.c b/host/src/processread.c index c78f463..f781e0f 100644 --- a/host/src/processread.c +++ b/host/src/processread.c @@ -569,6 +569,10 @@ static void set_variant(dpu_result_out_t result_match, genome_t *ref_genome, int if (backtrack_end->type == CODE_ERR) return; + #if DEBUG_READ_MAPPING + write_read_mapping_from_backtrack(ref_genome->seq_name[result_match.coord.seq_nr], result_match.coord.seed_nr, backtrack_end, read, result_match.num); + #endif + /* Update "mapping_coverage" with the number of reads that match at this position of the genome */ for (int i = 0; i < SIZE_READ; i++) { ref_genome->mapping_coverage[genome_pos + i] += 1; @@ -606,29 +610,30 @@ static void set_variant(dpu_result_out_t result_match, genome_t *ref_genome, int for (;backtrack_end->type != 0 && backtrack_end->type != CODE_END; backtrack_end--) { switch (backtrack_end->type) { 
case CODE_SUB: - newvar->ref[ref_idx++] = nucleotide[ref_genome->data[genome_pos+backtrack_end->ix] & 0x3]; - newvar->alt[alt_idx++] = nucleotide[read[backtrack_end->jx] & 0x3]; if (alt_idx >= MAX_SIZE_ALLELE || ref_idx >= MAX_SIZE_ALLELE) { LOG_WARN("clipped a variant because it was too complex") - break; + goto insert_variant; } + newvar->ref[ref_idx++] = nucleotide[ref_genome->data[genome_pos+backtrack_end->ix] & 0x3]; + newvar->alt[alt_idx++] = nucleotide[read[backtrack_end->jx] & 0x3]; break; case CODE_INS: - newvar->alt[alt_idx++] = nucleotide[read[backtrack_end->jx] & 0x3]; if (alt_idx >= MAX_SIZE_ALLELE) { LOG_WARN("clipped a variant because it was too complex") - break; + goto insert_variant; } + newvar->alt[alt_idx++] = nucleotide[read[backtrack_end->jx] & 0x3]; break; case CODE_DEL: - newvar->ref[ref_idx++] = nucleotide[ref_genome->data[genome_pos+backtrack_end->ix] & 0x3]; if (ref_idx >= MAX_SIZE_ALLELE) { LOG_WARN("clipped a variant because it was too complex") - break; + goto insert_variant; } + newvar->ref[ref_idx++] = nucleotide[ref_genome->data[genome_pos+backtrack_end->ix] & 0x3]; break; } } + insert_variant: newvar->ref[ref_idx] = '\0'; newvar->alt[alt_idx] = '\0'; variant_tree_insert( From 153d013003df5c21c35c23585b8aa651b96d4d3a Mon Sep 17 00:00:00 2001 From: amoisson Date: Fri, 9 Sep 2022 15:03:36 +0200 Subject: [PATCH 47/48] variant tree only allocated if frequency table not used --- host/src/upvc.c | 8 ++++++-- host/src/vartree.c | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/host/src/upvc.c b/host/src/upvc.c index d711339..5ce79fd 100644 --- a/host/src/upvc.c +++ b/host/src/upvc.c @@ -313,7 +313,9 @@ static void do_mapping() { STAT_RECORD_START(STAT_DO_MAPPING); backends_functions.init_backend(&nb_dpus_per_run); - variant_tree_init(); + if (!get_use_frequency_table()) { + variant_tree_init(); + } dispatch_init(); process_read_init(); STAT_RECORD_STEP(STAT_DO_MAPPING, 0); @@ -331,7 +333,9 @@ static void do_mapping() 
process_read_free(); dispatch_free(); - variant_tree_free(); + if (!get_use_frequency_table()) { + variant_tree_free(); + } backends_functions.free_backend(); STAT_RECORD_LAST_STEP(STAT_DO_MAPPING, 3); } diff --git a/host/src/vartree.c b/host/src/vartree.c index bfb54dc..5ba08c6 100644 --- a/host/src/vartree.c +++ b/host/src/vartree.c @@ -99,7 +99,7 @@ void variant_tree_init() genome_t *genome = genome_get(); pthread_mutex_init(&mutex, NULL); for (unsigned int each_seq = 0; each_seq < genome->nb_seq; each_seq++) { - LOG_INFO("allocating variant_list (%luMB)\n", sizeof(variant_t*) * genome->len_seq[each_seq]); + LOG_INFO("allocating variant_list (%luMB)\n", sizeof(variant_t*) * genome->len_seq[each_seq]/1000000); variant_list[each_seq] = (variant_t **)calloc(genome->len_seq[each_seq], sizeof(variant_t *)); } } From 9b06ee07740ba8cafa9147d2718a1886677abbb6 Mon Sep 17 00:00:00 2001 From: amoisson Date: Fri, 9 Sep 2022 15:36:26 +0200 Subject: [PATCH 48/48] added INFO logs for more allocs --- host/src/accumulateread.c | 3 +++ host/src/genome.c | 1 + host/src/getread.c | 2 ++ 3 files changed, 6 insertions(+) diff --git a/host/src/accumulateread.c b/host/src/accumulateread.c index 6748aa5..95b401c 100644 --- a/host/src/accumulateread.c +++ b/host/src/accumulateread.c @@ -177,6 +177,7 @@ acc_results_t accumulate_get_result(unsigned int pass_id, bool free_results) size_t size = ftell(result_file[pass_id]); rewind(result_file[pass_id]); + LOG_INFO("allocating %lu to accumulate results\n", size); dpu_result_out_t *results = (dpu_result_out_t *)malloc(size); assert(results != NULL); size_t size_read = fread(results, size, 1, result_file[pass_id]); @@ -213,6 +214,7 @@ void accumulate_read(unsigned int pass_id, unsigned int dpu_offset) return; } + LOG_INFO("allocating %lu for bucket_elems\n", sizeof(bucket_elem_t) * total_nb_res); bucket_elems = (bucket_elem_t *)malloc(sizeof(bucket_elem_t) * total_nb_res); assert(bucket_elems != NULL); @@ -298,6 +300,7 @@ void 
accumulate_init(unsigned int max_nb_pass) result_file = (FILE **)calloc(nb_pass, sizeof(FILE *)); assert(result_file != NULL); + LOG_INFO("allocating %lu for results_buffers\n", sizeof(acc_results_t) * nb_dpus_per_run * NB_DISPATCH_AND_ACC_BUFFER); for (unsigned int each_pass = 0; each_pass < NB_DISPATCH_AND_ACC_BUFFER; each_pass++) { results_buffers[each_pass] = (acc_results_t *)malloc(sizeof(acc_results_t) * nb_dpus_per_run); assert(results_buffers[each_pass] != NULL); diff --git a/host/src/genome.c b/host/src/genome.c index aaa811e..157f915 100644 --- a/host/src/genome.c +++ b/host/src/genome.c @@ -142,6 +142,7 @@ struct codependence_chunk { static void allocate_new_codependence_chunk(int i) { STAT_RECORD_START(STAT_ALLOCATE_NEW_CHUNK); + LOG_INFO("allocating codependence_chunk (%lu)\n", sizeof(struct codependence_chunk)); struct codependence_chunk* new_chunk = calloc(1, sizeof(struct codependence_chunk)); assert(new_chunk != NULL); new_chunk->previous_chunk = last_allocated_codependence_chunk[i]; diff --git a/host/src/getread.c b/host/src/getread.c index 27e3249..6e06645 100644 --- a/host/src/getread.c +++ b/host/src/getread.c @@ -11,6 +11,7 @@ #include #include "common.h" +#include "debug.h" #include "getread.h" #include "upvc.h" @@ -104,6 +105,7 @@ void get_reads(FILE *fpe1, FILE *fpe2, unsigned int pass_id) int8_t *reads_buffer = reads_buffers[pass_id]; float* reads_quality_buffer = reads_quality_buffers[pass_id]; if (reads_buffer == NULL) { + LOG_INFO("allocating %lu for reads_buffer\n", MAX_READS_BUFFER*SIZE_READ*(1+sizeof(float))); reads_buffer = (int8_t *)malloc(MAX_READS_BUFFER * SIZE_READ); reads_quality_buffer = (float *)malloc(MAX_READS_BUFFER/2 * SIZE_READ * sizeof(float)); assert(reads_buffer != NULL);