/* * The MIT License * * Wavefront Alignment Algorithms * Copyright (c) 2017 by Santiago Marco-Sola * * This file is part of Wavefront Alignment Algorithms. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * * PROJECT: Wavefront Alignment Algorithms * AUTHOR(S): Santiago Marco-Sola * DESCRIPTION: Packed CIGAR (Alignment operations in 2-bits) */ #include "utils/commons.h" #include "system/mm_allocator.h" #include "wavefront_pcigar.h" /* * Packed CIGAR operations */ typedef struct { char operation; int inc_v; int inc_h; affine_matrix_type matrix_type; } pcigar_op_t; pcigar_op_t pcigar_lut[4] = { { .operation = '?', .inc_v = 0, .inc_h = 0, .matrix_type = affine_matrix_M }, // 00 - None { .operation = 'D', .inc_v = 1, .inc_h = 0, .matrix_type = affine_matrix_D }, // 01 - DELETION { .operation = 'X', .inc_v = 1, .inc_h = 1, .matrix_type = affine_matrix_M }, // 10 - MISMATCH { .operation = 'I', .inc_v = 0, .inc_h = 1, .matrix_type = affine_matrix_I }, // 11 - INSERTION }; // Precomputed string of Matches char matches_lut[8] = "MMMMMMMM"; #define CIGAR_8MATCHES_UINT64 *((uint64_t*)matches_lut) /* * Accessors */ int pcigar_get_length( const pcigar_t pcigar) { int cigar_length = PCIGAR_MAX_LENGTH; if (!PCIGAR_IS_UTILISED(pcigar,PCIGAR_FULL_MASK)) { const int free_slots = PCIGAR_FREE_SLOTS(pcigar); cigar_length -= free_slots; } return cigar_length; } int pcigar_unpack( pcigar_t pcigar, char* cigar_buffer) { // Compute pcigar length and shift to the end of the word int pcigar_length = PCIGAR_MAX_LENGTH; if (!PCIGAR_IS_UTILISED(pcigar,PCIGAR_FULL_MASK)) { const int free_slots = PCIGAR_FREE_SLOTS(pcigar); pcigar_length -= free_slots; pcigar <<= free_slots*2; } // Unpack BT-blocks int i; for (i=0;ioperation; } return pcigar_length; } /* * PCIGAR extend exact-matches */ int pcigar_unpack_extend( const char* const pattern, const int pattern_length, const char* const text, const int text_length, int v, int h, char* cigar_buffer) { int num_matches = 0; // Fetch pattern/text blocks uint64_t* pattern_blocks = (uint64_t*)(pattern+v); uint64_t* text_blocks = (uint64_t*)(text+h); uint64_t pattern_block = *pattern_blocks; uint64_t text_block = *text_blocks; // Compare 64-bits blocks uint64_t cmp = pattern_block ^ text_block; while (cmp==0 && (v+8) < pattern_length && (h+8) < text_length) { // Increment offset v += 8; h += 8; num_matches += 8; // Dump matches in block *((uint64_t*)cigar_buffer) = CIGAR_8MATCHES_UINT64; cigar_buffer += 8; // Next blocks ++pattern_blocks; ++text_blocks; // Fetch & Compare pattern_block = *pattern_blocks; text_block = *text_blocks; cmp = pattern_block ^ text_block; } // Count equal characters num_matches += __builtin_ctzl(cmp)/8; *((uint64_t*)cigar_buffer) = CIGAR_8MATCHES_UINT64; // Return total matches return num_matches; } int pcigar_unpack_extend_custom( const int pattern_length, const int text_length, alignment_match_funct_t const match_funct, void* const match_funct_arguments, int v, int h, char* cigar_buffer) { int num_matches = 0; while (v < pattern_length && h < text_length) { // Check match if (!match_funct(v,h,match_funct_arguments)) break; ++v; ++h; // Increment matches *cigar_buffer = 'M'; ++cigar_buffer; ++num_matches; } return num_matches; } /* * PCIGAR unpack */ void pcigar_unpack_linear( pcigar_t pcigar, wavefront_sequences_t* const sequences, int* const v_pos, int* const h_pos, char* cigar_buffer, int* const cigar_length) { // Parameters char* const pattern = sequences->pattern; const int pattern_length = sequences->pattern_length; char* const text = sequences->text; const int text_length = sequences->text_length; char* const cigar_buffer_base = cigar_buffer; // Compute pcigar length and shift to the end of the word int pcigar_length = PCIGAR_MAX_LENGTH; if (!PCIGAR_IS_UTILISED(pcigar,PCIGAR_FULL_MASK)) { const int free_slots = PCIGAR_FREE_SLOTS(pcigar); pcigar_length -= free_slots; pcigar <<= free_slots*2; } // Unpack BT-blocks int v = *v_pos, h = *h_pos, i; for (i=0;imode == wf_sequences_lambda) { // Custom extend-match function num_matches = pcigar_unpack_extend_custom( pattern_length,text_length,sequences->match_funct, sequences->match_funct_arguments,v,h,cigar_buffer); } else { num_matches = pcigar_unpack_extend( pattern,pattern_length,text,text_length,v,h,cigar_buffer); } // Update location v += num_matches; h += num_matches; cigar_buffer += num_matches; // Extract next CIGAR operation const int cigar_op = (int)PCIGAR_EXTRACT(pcigar); // Extract PCIGAR_POP_FRONT(pcigar); // Shift // Add operation using LUT pcigar_op_t* const op = pcigar_lut + cigar_op; *(cigar_buffer++) = op->operation; v += op->inc_v; h += op->inc_h; } // Update length/positions *cigar_length = cigar_buffer - cigar_buffer_base; *v_pos = v; *h_pos = h; } void pcigar_unpack_affine( pcigar_t pcigar, wavefront_sequences_t* const sequences, int* const v_pos, int* const h_pos, char* cigar_buffer, int* const cigar_length, affine_matrix_type* const current_matrix_type) { // Parameters char* const pattern = sequences->pattern; const int pattern_length = sequences->pattern_length; char* const text = sequences->text; const int text_length = sequences->text_length; char* const cigar_buffer_base = cigar_buffer; // Compute pcigar length and shift to the end of the word int pcigar_length = PCIGAR_MAX_LENGTH; if (!PCIGAR_IS_UTILISED(pcigar,PCIGAR_FULL_MASK)) { const int free_slots = PCIGAR_FREE_SLOTS(pcigar); pcigar_length -= free_slots; pcigar <<= free_slots*2; } // Unpack BT-blocks affine_matrix_type matrix_type = *current_matrix_type; int v = *v_pos, h = *h_pos, i; for (i=0;imode == wf_sequences_lambda) { // Custom extend-match function num_matches = pcigar_unpack_extend_custom( pattern_length,text_length,sequences->match_funct, sequences->match_funct_arguments,v,h,cigar_buffer); } else { num_matches = pcigar_unpack_extend( pattern,pattern_length,text,text_length,v,h,cigar_buffer); } // Update location v += num_matches; h += num_matches; cigar_buffer += num_matches; } // Extract next CIGAR operation const int cigar_op = (int)PCIGAR_EXTRACT(pcigar); // Extract PCIGAR_POP_FRONT(pcigar); // Shift // Add operation using LUT pcigar_op_t* const op = pcigar_lut + cigar_op; // Avoid X after I/D (used to encode gap-close) if (matrix_type != affine_matrix_M && op->operation=='X') { matrix_type = affine_matrix_M; continue; } // Add operation *(cigar_buffer++) = op->operation; v += op->inc_v; h += op->inc_h; matrix_type = op->matrix_type; } // Update length/positions *cigar_length = cigar_buffer - cigar_buffer_base; *v_pos = v; *h_pos = h; *current_matrix_type = matrix_type; } /* * Display */ void pcigar_print( FILE* const stream, pcigar_t pcigar) { char cigar_buffer[64]; const int pcigar_length = pcigar_unpack(pcigar,cigar_buffer); cigar_buffer[pcigar_length] = '\0'; fprintf(stream,"%s",cigar_buffer); }