VariantKey  5.4.1
Numerical Encoding for Human Genetic Variants
genoref.h
Go to the documentation of this file.
1 // VariantKey
2 //
3 // genoref.h
4 //
5 // @category Libraries
6 // @author Nicola Asuni <nicola.asuni@genomicsplc.com>
7 // @copyright 2017-2018 GENOMICS plc
8 // @license MIT (see LICENSE)
9 // @link https://github.com/genomicsplc/variantkey
10 //
11 // LICENSE
12 //
13 // Copyright (c) 2017-2018 GENOMICS plc
14 //
15 // Permission is hereby granted, free of charge, to any person obtaining a copy
16 // of this software and associated documentation files (the "Software"), to deal
17 // in the Software without restriction, including without limitation the rights
18 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
19 // copies of the Software, and to permit persons to whom the Software is
20 // furnished to do so, subject to the following conditions:
21 //
22 // The above copyright notice and this permission notice shall be included in
23 // all copies or substantial portions of the Software.
24 //
25 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
26 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
27 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
28 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
29 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
30 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
31 // THE SOFTWARE.
32 
44 #ifndef VARIANTKEY_GENOREF_H
45 #define VARIANTKEY_GENOREF_H
46 
47 #include <stdio.h>
48 #include <string.h>
49 #include "binsearch.h"
50 #include "variantkey.h"
51 
52 #ifndef ALLELE_MAXSIZE
53 #define ALLELE_MAXSIZE 256
54 #endif
55 
56 // Return codes for normalize_variant()
57 #define NORM_WRONGPOS (-2)
58 #define NORM_INVALID (-1)
59 #define NORM_OK (0)
60 #define NORM_VALID (1)
61 #define NORM_SWAP (1 << 1)
62 #define NORM_FLIP (1 << 2)
63 #define NORM_LEXT (1 << 3)
64 #define NORM_RTRIM (1 << 4)
65 #define NORM_LTRIM (1 << 5)
66 
67 
75 static inline void mmap_genoref_file(const char *file, mmfile_t *mf)
76 {
77  mmap_binfile(file, mf);
78  mf->index[26] = mf->size;
79  int i = 25;
80  while (i > 0)
81  {
82  mf->index[i] = mf->index[(i - 1)];
83  i--;
84  }
85  mf->ncols = 27;
86 }
87 
97 static inline int aztoupper(int c)
98 {
99  if (c >= 'a')
100  {
101  return (c ^ ('a' - 'A'));
102  }
103  return c;
104 }
105 
115 static inline void prepend_char(const uint8_t pre, char *string, size_t *size)
116 {
117  memmove(string + 1, string, (*size + 1));
118  string[0] = pre;
119  (*size)++;
120 }
121 
131 static inline char get_genoref_seq(mmfile_t mf, uint8_t chrom, uint32_t pos)
132 {
133  uint64_t offset = (mf.index[chrom] + pos);
134  if (offset >= mf.index[(chrom + 1)])
135  {
136  return 0; // invalid position
137  }
138  return *(mf.src + offset);
139 }
140 
156 static inline int check_reference(mmfile_t mf, uint8_t chrom, uint32_t pos, const char *ref, size_t sizeref)
157 {
158  uint64_t offset = (mf.index[chrom] + pos);
159  if ((offset + sizeref - 1) >= mf.index[(chrom + 1)])
160  {
161  return NORM_WRONGPOS;
162  }
163  size_t i;
164  char uref, gref;
165  int ret = 0; // return value
166  for (i = 0; i < sizeref; i++)
167  {
168  uref = aztoupper(ref[i]);
169  gref = mf.src[(offset + i)];
170  if (uref == gref)
171  {
172  continue;
173  }
174  /*
175  Abbreviation codes for degenerate bases
176 
177  Cornish-Bowden A.
178  Nomenclature for incompletely specified bases in nucleic acid sequences: recommendations 1984.
179  Nucleic Acids Research. 1985;13(9):3021-3030.
180 
181  SYMBOL | DESCRIPTION | BASES | COMPLEMENT
182  -------+-------------------------------+---------+-----------
183  A | Adenine | A | T
184  C | Cytosine | C | G
185  G | Guanine | G | C
186  T | Thymine | T | A
187  W | Weak | A T | W
188  S | Strong | C G | S
189  M | aMino | A C | K
190  K | Keto | G T | M
191  R | puRine | A G | Y
192  Y | pYrimidine | C T | R
193  B | not A (B comes after A) | C G T | V
194  D | not C (D comes after C) | A G T | H
195  H | not G (H comes after G) | A C T | D
196  V | not T (V comes after T and U) | A C G | B
197  N | aNy base (not a gap) | A C G T | N
198  -------+-------------------------------+---------+----------
199  */
200  if ((uref == 'N')
201  || (gref == 'N')
202  || ((uref == 'B') && (gref != 'A'))
203  || ((gref == 'B') && (uref != 'A'))
204  || ((uref == 'D') && (gref != 'C'))
205  || ((gref == 'D') && (uref != 'C'))
206  || ((uref == 'H') && (gref != 'G'))
207  || ((gref == 'H') && (uref != 'G'))
208  || ((uref == 'V') && (gref != 'T'))
209  || ((gref == 'V') && (uref != 'T'))
210  || ((uref == 'W') && ((gref == 'A') || (gref == 'T')))
211  || ((gref == 'W') && ((uref == 'A') || (uref == 'T')))
212  || ((uref == 'S') && ((gref == 'C') || (gref == 'G')))
213  || ((gref == 'S') && ((uref == 'C') || (uref == 'G')))
214  || ((uref == 'M') && ((gref == 'A') || (gref == 'C')))
215  || ((gref == 'M') && ((uref == 'A') || (uref == 'C')))
216  || ((uref == 'K') && ((gref == 'G') || (gref == 'T')))
217  || ((gref == 'K') && ((uref == 'G') || (uref == 'T')))
218  || ((uref == 'R') && ((gref == 'A') || (gref == 'G')))
219  || ((gref == 'R') && ((uref == 'A') || (uref == 'G')))
220  || ((uref == 'Y') && ((gref == 'C') || (gref == 'T')))
221  || ((gref == 'Y') && ((uref == 'C') || (uref == 'T'))))
222  {
223  ret = NORM_VALID; // valid but not consistent
224  continue;
225  }
226  return NORM_INVALID; // invalid reference
227  }
228  return ret; // sequence OK
229 }
230 
239 static inline void flip_allele(char *allele, size_t size)
240 {
241  /*
242  Byte map for allele flipping (complement):
243 
244  A > T
245  T > A
246  C > G
247  G > C
248  M > K
249  K > M
250  R > Y
251  Y > R
252  B > V
253  V > B
254  D > H
255  H > D
256  */
257  static const char map[] = "00000000000000000000000000000000"
258  "00000000000000000123456789000000"
259  /*ABCDEFGHIJKLMNOPQRSTUVWXYZ*/
260  "0TVGHEFCDIJMLKNOPQYSAUBWXRZ00000"
261  /*abcdefghijklmnopqrstuvwxyz*/
262  "0TVGHEFCDIJMLKNOPQYSAUBWXRZ00000"
263  "00000000000000000000000000000000"
264  "00000000000000000000000000000000"
265  "00000000000000000000000000000000"
266  "00000000000000000000000000000000";
267  size_t i;
268  for (i = 0; i < size; i++)
269  {
270  allele[i] = map[((uint8_t)allele[i])];
271  }
272  allele[size] = 0;
273 }
274 
275 static inline void swap_sizes(size_t *first, size_t *second)
276 {
277  size_t tmp = *first;
278  *first = *second;
279  *second = tmp;
280 }
281 
282 static inline void swap_alleles(char *first, size_t *sizefirst, char *second, size_t *sizesecond)
283 {
284  char tmp[ALLELE_MAXSIZE];
285  strncpy(tmp, first, *sizefirst);
286  strncpy(first, second, *sizesecond);
287  strncpy(second, tmp, *sizefirst);
288  swap_sizes(sizefirst, sizesecond);
289  first[*sizefirst] = 0;
290  second[*sizesecond] = 0;
291 }
292 
315 static inline int normalize_variant(mmfile_t mf, uint8_t chrom, uint32_t *pos, char *ref, size_t *sizeref, char *alt, size_t *sizealt)
316 {
317  char left;
318  char fref[ALLELE_MAXSIZE];
319  char falt[ALLELE_MAXSIZE];
320  int status;
321  status = check_reference(mf, chrom, *pos, ref, *sizeref);
322  if (status == -2)
323  {
324  return status; // invalid position
325  }
326  if (status < 0)
327  {
328  status = check_reference(mf, chrom, *pos, alt, *sizealt);
329  if (status >= 0)
330  {
331  swap_alleles(ref, sizeref, alt, sizealt);
332  status |= NORM_SWAP;
333  }
334  else
335  {
336  strncpy(fref, ref, *sizeref);
337  flip_allele(fref, *sizeref);
338  status = check_reference(mf, chrom, *pos, fref, *sizeref);
339  if (status >= 0)
340  {
341  strncpy(ref, fref, *sizealt);
342  flip_allele(alt, *sizealt);
343  status |= NORM_FLIP;
344  }
345  else
346  {
347  strncpy(falt, alt, *sizealt);
348  flip_allele(falt, *sizealt);
349  status = check_reference(mf, chrom, *pos, falt, *sizealt);
350  if (status >= 0)
351  {
352  strncpy(ref, falt, *sizealt);
353  strncpy(alt, fref, *sizeref);
354  swap_sizes(sizeref, sizealt);
355  status |= NORM_SWAP + NORM_FLIP;
356  }
357  else
358  {
359  return status; // invalid reference
360  }
361  }
362  }
363  }
364  if ((*sizealt == 1) && (*sizeref == 1))
365  {
366  return status; // SNP
367  }
368  while (1)
369  {
370  // left extend
371  if (((*sizealt == 0) || (*sizeref == 0)) && (*pos > 0))
372  {
373  (*pos)--;
374  left = (char)mf.src[(mf.index[chrom] + *pos)];
375  prepend_char(left, alt, sizealt);
376  prepend_char(left, ref, sizeref);
377  status |= NORM_LEXT;
378  }
379  else
380  {
381  // right trim
382  if ((*sizealt > 1) && (*sizeref > 1) && (aztoupper(alt[(*sizealt - 1)]) == aztoupper(ref[(*sizeref - 1)])))
383  {
384  (*sizealt)--;
385  (*sizeref)--;
386  status |= NORM_RTRIM;
387  }
388  else
389  {
390  break;
391  }
392  }
393  }
394  // left trim
395  uint8_t offset = 0;
396  while ((offset < (*sizealt - 1)) && (offset < (*sizeref - 1)) && (aztoupper(alt[offset]) == aztoupper(ref[offset])))
397  {
398  offset++;
399  }
400  if (offset > 0)
401  {
402  *pos += offset;
403  *sizeref -= offset;
404  *sizealt -= offset;
405  memmove(ref, ref + offset, *sizeref);
406  memmove(alt, alt + offset, *sizealt);
407  status |= NORM_LTRIM;
408  }
409  ref[*sizeref] = 0;
410  alt[*sizealt] = 0;
411  return status;
412 }
413 
432 static inline uint64_t normalized_variantkey(mmfile_t mf, const char *chrom, size_t sizechrom, uint32_t *pos, uint8_t posindex, char *ref, size_t *sizeref, char *alt, size_t *sizealt, int *ret)
433 {
434  uint8_t echrom = encode_chrom(chrom, sizechrom);
435  (*pos) -= posindex;
436  *ret = normalize_variant(mf, echrom, pos, ref, sizeref, alt, sizealt);
437  return encode_variantkey(echrom, *pos, encode_refalt(ref, *sizeref, alt, *sizealt));
438 }
439 
440 #endif // VARIANTKEY_GENOREF_H
#define ALLELE_MAXSIZE
Maximum allele length.
Definition: genoref.h:53
static int aztoupper(int c)
Definition: genoref.h:97
#define NORM_FLIP
Normalization: The alleles nucleotides have been flipped (each nucleotide have been replaced with its...
Definition: genoref.h:62
static void prepend_char(const uint8_t pre, char *string, size_t *size)
Definition: genoref.h:115
#define NORM_INVALID
Normalization: Invalid reference.
Definition: genoref.h:58
Definition: binsearch.h:229
static uint64_t encode_variantkey(uint8_t chrom, uint32_t pos, uint32_t refalt)
Returns a 64 bit variant key based on the pre-encoded CHROM, POS (0-based) and REF+ALT.
Definition: variantkey.h:440
static void mmap_genoref_file(const char *file, mmfile_t *mf)
Definition: genoref.h:75
uint8_t * src
Pointer to the memory map.
Definition: binsearch.h:231
static void mmap_binfile(const char *file, mmfile_t *mf)
Definition: binsearch.h:995
static int check_reference(mmfile_t mf, uint8_t chrom, uint32_t pos, const char *ref, size_t sizeref)
Definition: genoref.h:156
#define NORM_LTRIM
Normalization: Alleles have been left trimmed.
Definition: genoref.h:65
static void flip_allele(char *allele, size_t size)
Definition: genoref.h:239
#define NORM_SWAP
Normalization: The alleles have been swapped.
Definition: genoref.h:61
static void swap_sizes(size_t *first, size_t *second)
Definition: genoref.h:275
VariantKey main functions.
uint64_t size
File size in bytes.
Definition: binsearch.h:233
#define NORM_RTRIM
Normalization: Alleles have been right trimmed.
Definition: genoref.h:64
Functions to search values in binary files made of constant-length items.
static uint8_t encode_chrom(const char *chrom, size_t size)
Returns chromosome numerical encoding.
Definition: variantkey.h:86
uint8_t ncols
Number of columns - THIS MUST BE MANUALLY SET EXCEPT FOR THE "BINSRC1" FORMAT.
Definition: binsearch.h:237
#define NORM_LEXT
Normalization: Alleles have been left extended.
Definition: genoref.h:63
uint64_t index[256]
Index of the offsets to the beginning of each column.
Definition: binsearch.h:239
static void swap_alleles(char *first, size_t *sizefirst, char *second, size_t *sizesecond)
Definition: genoref.h:282
#define NORM_VALID
Normalization: The reference allele is inconsistent with the genome reference (i.e. when contains nucleotide letters other than A, C, G and T).
Definition: genoref.h:60
static char get_genoref_seq(mmfile_t mf, uint8_t chrom, uint32_t pos)
Definition: genoref.h:131
static uint64_t normalized_variantkey(mmfile_t mf, const char *chrom, size_t sizechrom, uint32_t *pos, uint8_t posindex, char *ref, size_t *sizeref, char *alt, size_t *sizealt, int *ret)
Returns a normalized 64 bit variant key based on CHROM, POS, REF, ALT.
Definition: genoref.h:432
static uint32_t encode_refalt(const char *ref, size_t sizeref, const char *alt, size_t sizealt)
Returns reference+alternate numerical encoding.
Definition: variantkey.h:317
#define NORM_WRONGPOS
Normalization: Invalid position.
Definition: genoref.h:57
static int normalize_variant(mmfile_t mf, uint8_t chrom, uint32_t *pos, char *ref, size_t *sizeref, char *alt, size_t *sizealt)
Definition: genoref.h:315