VariantKey  5.4.1
Numerical Encoding for Human Genetic Variants
nrvk.h
Go to the documentation of this file.
1 // VariantKey
2 //
3 // nrvk.h
4 //
5 // @category Libraries
6 // @author Nicola Asuni <nicola.asuni@genomicsplc.com>
7 // @copyright 2017-2018 GENOMICS plc
8 // @license MIT (see LICENSE)
9 // @link https://github.com/genomicsplc/variantkey
10 //
11 // LICENSE
12 //
13 // Copyright (c) 2017-2018 GENOMICS plc
14 //
15 // Permission is hereby granted, free of charge, to any person obtaining a copy
16 // of this software and associated documentation files (the "Software"), to deal
17 // in the Software without restriction, including without limitation the rights
18 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
19 // copies of the Software, and to permit persons to whom the Software is
20 // furnished to do so, subject to the following conditions:
21 //
22 // The above copyright notice and this permission notice shall be included in
23 // all copies or substantial portions of the Software.
24 //
25 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
26 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
27 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
28 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
29 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
30 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
31 // THE SOFTWARE.
32 
53 #ifndef VARIANTKEY_NRVK_H
54 #define VARIANTKEY_NRVK_H
55 
56 #include <stdio.h>
57 #include <string.h>
58 #include "binsearch.h"
59 #include "variantkey.h"
60 
/**
 * Maximum allele length (size in bytes of the REF and ALT buffers,
 * including the terminating NUL character).
 * Can be overridden by defining ALLELE_MAXSIZE before including this header.
 */
#ifndef ALLELE_MAXSIZE
#define ALLELE_MAXSIZE 256
#endif

/**
 * Decoded components of a VariantKey, as filled in by reverse_variantkey().
 */
typedef struct variantkey_rev_t
{
    char chrom[3];            //!< Chromosome.
    uint32_t pos;             //!< Reference position, with the first base having position 0.
    char ref[ALLELE_MAXSIZE]; //!< Reference allele.
    char alt[ALLELE_MAXSIZE]; //!< Alternate allele.
    size_t sizeref;           //!< Length of reference allele.
    size_t sizealt;           //!< Length of alternate allele.
} variantkey_rev_t;

/**
 * Pointers to the columns of an NRVK binary file,
 * as populated by mmap_nrvk_file().
 */
struct nrvk_cols_t
{
    const uint64_t *vk;     //!< Pointer to the VariantKey column.
    const uint64_t *offset; //!< Pointer to the Offset column.
    const uint8_t *data;    //!< Pointer to the Data column.
    uint64_t nrows;         //!< Number of rows.
};
typedef struct nrvk_cols_t nrvk_cols_t;
88 
98 static inline void mmap_nrvk_file(const char *file, mmfile_t *mf, nrvk_cols_t *nvc)
99 {
100  mmap_binfile(file, mf);
101  nvc->vk = (const uint64_t *)(mf->src + mf->index[0]);
102  nvc->offset = (const uint64_t *)(mf->src + mf->index[1]);
103  nvc->data = (const uint8_t *)(mf->src + mf->index[2]);
104  nvc->nrows = mf->nrows;
105 }
106 
107 static inline size_t get_nrvk_ref_alt_by_pos(nrvk_cols_t nvc, uint64_t pos, char *ref, size_t *sizeref, char *alt, size_t *sizealt)
108 {
109  if (pos >= nvc.nrows)
110  {
111  return 0; // not found
112  }
113  const uint8_t *data = (nvc.data + *(nvc.offset + pos));
114  *sizeref = (size_t)(*(data++));
115  *sizealt = (size_t)(*(data++));
116  memcpy(ref, data, *sizeref);
117  ref[*sizeref] = 0;
118  memcpy(alt, (data + *sizeref), *sizealt);
119  alt[*sizealt] = 0;
120  return (*sizeref + *sizealt);
121 }
122 
137 static inline size_t find_ref_alt_by_variantkey(nrvk_cols_t nvc, uint64_t vk, char *ref, size_t *sizeref, char *alt, size_t *sizealt)
138 {
139  uint64_t first = 0;
140  uint64_t max = nvc.nrows;
141  uint64_t found = col_find_first_uint64_t(nvc.vk, &first, &max, vk);
142  return get_nrvk_ref_alt_by_pos(nvc, found, ref, sizeref, alt, sizealt);
143 }
144 
154 static inline size_t reverse_variantkey(nrvk_cols_t nvc, uint64_t vk, variantkey_rev_t *rev)
155 {
157  rev->pos = extract_variantkey_pos(vk);
158  size_t len = decode_refalt(extract_variantkey_refalt(vk), rev->ref, &rev->sizeref, rev->alt, &rev->sizealt);
159  if ((len == 0) && (nvc.nrows > 0))
160  {
161  len = find_ref_alt_by_variantkey(nvc, vk, rev->ref, &rev->sizeref, rev->alt, &rev->sizealt);
162  }
163  return len;
164 }
165 
174 static inline size_t get_variantkey_ref_length(nrvk_cols_t nvc, uint64_t vk)
175 {
176  if ((vk & 0x1) == 0) // check last bit for reversible encoding
177  {
178  return ((vk & 0x0000000078000000) >> 27); // [00000000 00000000 00000000 00000000 01111000 00000000 00000000 00000000]
179  }
180  uint64_t first = 0;
181  uint64_t max = nvc.nrows;
182  uint64_t found = col_find_first_uint64_t(nvc.vk, &first, &max, vk);
183  if (found >= nvc.nrows)
184  {
185  return 0; // not found
186  }
187  return (size_t)(*(nvc.data + *(nvc.offset + found)));
188 }
189 
198 static inline uint32_t get_variantkey_endpos(nrvk_cols_t nvc, uint64_t vk)
199 {
200  return (extract_variantkey_pos(vk) + (uint32_t)get_variantkey_ref_length(nvc, vk));
201 }
202 
209 static inline uint64_t get_variantkey_chrom_startpos(uint64_t vk)
210 {
211  return (vk >> VKSHIFT_POS);
212 }
213 
221 static inline uint64_t get_variantkey_chrom_endpos(nrvk_cols_t nvc, uint64_t vk)
222 {
223  return (((vk & VKMASK_CHROM) >> VKSHIFT_POS) | (uint64_t)get_variantkey_endpos(nvc, vk));
224 }
225 
235 static inline size_t nrvk_bin_to_tsv(nrvk_cols_t nvc, const char *tsvfile)
236 {
237  FILE * fp;
238  size_t sizeref, sizealt, len = 0;
239  char ref[ALLELE_MAXSIZE];
240  char alt[ALLELE_MAXSIZE];
241  uint64_t i;
242  fp = fopen(tsvfile, "we");
243  if (fp == NULL)
244  {
245  return 0;
246  }
247  for (i = 0; i < nvc.nrows; i++)
248  {
249  len += (get_nrvk_ref_alt_by_pos(nvc, i, ref, &sizeref, alt, &sizealt) + 19);
250  fprintf(fp, "%016" PRIx64 "\t%s\t%s\n", *nvc.vk++, ref, alt);
251  }
252  fclose(fp);
253  return len;
254 }
255 
256 #endif // VARIANTKEY_NRVK_H
static uint8_t extract_variantkey_chrom(uint64_t vk)
Extract the CHROM code from VariantKey.
Definition: variantkey.h:451
size_t sizeref
Length of reference allele.
Definition: nrvk.h:74
static uint32_t extract_variantkey_pos(uint64_t vk)
Extract the POS code from VariantKey.
Definition: variantkey.h:462
uint64_t nrows
Number of rows.
Definition: binsearch.h:236
size_t sizealt
Length of alternate allele.
Definition: nrvk.h:75
Definition: nrvk.h:68
Definition: binsearch.h:229
const uint64_t * offset
Pointer to the Offset column.
Definition: nrvk.h:84
static uint64_t get_variantkey_chrom_endpos(nrvk_cols_t nvc, uint64_t vk)
Get the CHROM + END POS encoding from VariantKey.
Definition: nrvk.h:221
struct nrvk_cols_t nrvk_cols_t
uint8_t * src
Pointer to the memory map.
Definition: binsearch.h:231
uint32_t pos
Reference position, with the first base having position 0.
Definition: nrvk.h:71
static size_t decode_chrom(uint8_t code, char *chrom)
Decode the chromosome numerical code.
Definition: variantkey.h:145
static size_t decode_refalt(uint32_t code, char *ref, size_t *sizeref, char *alt, size_t *sizealt)
Decode the 32-bit REF+ALT code if reversible (if it has 11 or fewer bases in total and only contains the letters A, C, G and T).
Definition: variantkey.h:423
static size_t get_nrvk_ref_alt_by_pos(nrvk_cols_t nvc, uint64_t pos, char *ref, size_t *sizeref, char *alt, size_t *sizealt)
Definition: nrvk.h:107
static uint64_t col_find_first_uint64_t(const uint64_t *src, uint64_t *first, uint64_t *last, uint64_t search)
Definition: binsearch.h:708
char chrom[3]
Chromosome.
Definition: nrvk.h:70
static void mmap_binfile(const char *file, mmfile_t *mf)
Definition: binsearch.h:995
struct variantkey_rev_t variantkey_rev_t
static size_t find_ref_alt_by_variantkey(nrvk_cols_t nvc, uint64_t vk, char *ref, size_t *sizeref, char *alt, size_t *sizealt)
Definition: nrvk.h:137
static uint32_t get_variantkey_endpos(nrvk_cols_t nvc, uint64_t vk)
Definition: nrvk.h:198
static size_t reverse_variantkey(nrvk_cols_t nvc, uint64_t vk, variantkey_rev_t *rev)
Definition: nrvk.h:154
static size_t nrvk_bin_to_tsv(nrvk_cols_t nvc, const char *tsvfile)
Definition: nrvk.h:235
static uint32_t extract_variantkey_refalt(uint64_t vk)
Extract the REF+ALT code from VariantKey.
Definition: variantkey.h:473
VariantKey main functions.
static size_t get_variantkey_ref_length(nrvk_cols_t nvc, uint64_t vk)
Definition: nrvk.h:174
Definition: nrvk.h:81
const uint8_t * data
Pointer to the Data column.
Definition: nrvk.h:85
Functions to search values in binary files made of constant-length items.
#define VKSHIFT_POS
POS LSB position from the VariantKey LSB.
Definition: variantkey.h:56
uint64_t index[256]
Index of the offsets to the beginning of each column.
Definition: binsearch.h:239
#define ALLELE_MAXSIZE
Maximum allele length.
Definition: nrvk.h:62
const uint64_t * vk
Pointer to the VariantKey column.
Definition: nrvk.h:83
#define VKMASK_CHROM
VariantKey binary mask for CHROM [ 11111000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 ]
Definition: variantkey.h:51
static void mmap_nrvk_file(const char *file, mmfile_t *mf, nrvk_cols_t *nvc)
Definition: nrvk.h:98
char ref[256]
Reference allele.
Definition: nrvk.h:72
char alt[256]
Alternate allele.
Definition: nrvk.h:73
static uint64_t get_variantkey_chrom_startpos(uint64_t vk)
Get the CHROM + START POS encoding from VariantKey.
Definition: nrvk.h:209
uint64_t nrows
Number of rows.
Definition: nrvk.h:86