VariantKey  5.4.1
Numerical Encoding for Human Genetic Variants
regionkey.h
Go to the documentation of this file.
1 // RegionKey
2 //
3 // regionkey.h
4 //
5 // @category Libraries
6 // @author Nicola Asuni <nicola.asuni@genomicsplc.com>
7 // @copyright 2017-2018 GENOMICS plc
8 // @license MIT (see LICENSE)
9 // @link https://github.com/genomicsplc/regionkey
10 //
11 // LICENSE
12 //
13 // Copyright (c) 2017-2018 GENOMICS plc
14 //
15 // Permission is hereby granted, free of charge, to any person obtaining a copy
16 // of this software and associated documentation files (the "Software"), to deal
17 // in the Software without restriction, including without limitation the rights
18 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
19 // copies of the Software, and to permit persons to whom the Software is
20 // furnished to do so, subject to the following conditions:
21 //
22 // The above copyright notice and this permission notice shall be included in
23 // all copies or substantial portions of the Software.
24 //
25 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
26 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
27 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
28 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
29 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
30 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
31 // THE SOFTWARE.
32 
41 #ifndef VARIANTKEY_REGIONKEY_H
42 #define VARIANTKEY_REGIONKEY_H
43 
44 #include <stdio.h>
45 #include "nrvk.h"
46 
/** Maximum position value (2^28 - 1). */
#define RK_MAX_POS 0x000000000FFFFFFF

/** RegionKey binary mask for CHROM (5 bits, bits 59-63). */
#define RKMASK_CHROM 0xF800000000000000

/** RegionKey binary mask for START POS (28 bits, bits 31-58). */
#define RKMASK_STARTPOS 0x07FFFFFF80000000

/** RegionKey binary mask for END POS (28 bits, bits 3-30). */
#define RKMASK_ENDPOS 0x000000007FFFFFF8

/** RegionKey binary mask for STRAND (2 bits, bits 1-2). */
#define RKMASK_STRAND 0x0000000000000006

/** RegionKey binary mask WITHOUT the positions (CHROM bits plus the three lowest bits). */
#define RKMASK_NOPOS 0xF800000000000007

/** CHROM LSB position from the RegionKey LSB. */
#define RKSHIFT_CHROM 59

/** START POS LSB position from the RegionKey LSB. */
#define RKSHIFT_STARTPOS 31

/** END POS LSB position from the RegionKey LSB. */
#define RKSHIFT_ENDPOS 3

/** STRAND LSB position from the RegionKey LSB. */
#define RKSHIFT_STRAND 1

/*
 * Field extraction helpers.
 * NOTE: each of these expands to an expression that reads a variable named
 * `rk`, which must be in scope (as a 64 bit RegionKey) at the point of use.
 */

/** Extract the CHROM code from RegionKey. */
#define RK_CHROM ((rk & RKMASK_CHROM) >> RKSHIFT_CHROM)

/** Extract the START POS code from RegionKey. */
#define RK_STARTPOS ((rk & RKMASK_STARTPOS) >> RKSHIFT_STARTPOS)

/** Extract the END POS code from RegionKey. */
#define RK_ENDPOS ((rk & RKMASK_ENDPOS) >> RKSHIFT_ENDPOS)

/** Extract the STRAND from RegionKey. */
#define RK_STRAND ((rk & RKMASK_STRAND) >> RKSHIFT_STRAND)
62 
63 
/**
 * RegionKey decoded into its numerical components.
 */
typedef struct regionkey_t
{
    uint8_t chrom;     /**< Chromosome encoded number (only the 5 least significant bits are used). */
    uint32_t startpos; /**< Region start position (zero based). */
    uint32_t endpos;   /**< Region end position (startpos + region length). */
    uint8_t strand;    /**< Encoded region strand direction (0 > 0, 1 > +1, 2 > -1). */
} regionkey_t;
74 
/**
 * RegionKey decoded into its human-readable components
 * (chromosome as a string, strand as a signed direction).
 */
typedef struct regionkey_rev_t
{
    char chrom[3];     /**< Chromosome string (room for two characters plus the terminator). */
    uint32_t startpos; /**< Region start position (zero based). */
    uint32_t endpos;   /**< Region end position (startpos + region length). */
    int8_t strand;     /**< Region strand direction (-1, 0, +1). */
} regionkey_rev_t;
85 
/**
 * Encode the strand direction (-1 > 2, 0 > 0, +1 > 1).
 *
 * Only the two least significant bits of (strand + 1) select the table
 * entry, so values outside {-1, 0, +1} are reduced modulo 4 before lookup.
 *
 * @param strand Strand direction (-1, 0, +1).
 *
 * @return Strand code (0, 1 or 2).
 */
static inline uint8_t encode_region_strand(int8_t strand)
{
    /* Indexed by (strand + 1): -1 -> 2, 0 -> 0, +1 -> 1; the last slot is unused padding. */
    static const uint8_t map[] = {2, 0, 1, 0};
    /* The sum is computed in int, so the parameter is neither modified nor narrowed back to int8_t. */
    return map[((uint8_t)(strand + 1) & 3)];
}
97 
/**
 * Decode the strand direction code (0 > 0, 1 > +1, 2 > -1).
 *
 * Only the two least significant bits of the code are used; code 3 decodes to 0.
 *
 * @param strand Strand code.
 *
 * @return Strand direction (-1, 0, +1).
 */
static inline int8_t decode_region_strand(uint8_t strand)
{
    static const int8_t map[] = {0, 1, -1, 0};
    return map[(strand & 3)];
}
109 
119 static inline uint64_t encode_regionkey(uint8_t chrom, uint32_t startpos, uint32_t endpos, uint8_t strand)
120 {
121  return (((uint64_t)chrom << RKSHIFT_CHROM) | ((uint64_t)startpos << RKSHIFT_STARTPOS) | ((uint64_t)endpos << RKSHIFT_ENDPOS) | ((uint64_t)strand << RKSHIFT_STRAND));
122 }
123 
130 static inline uint8_t extract_regionkey_chrom(uint64_t rk)
131 {
132  return (uint8_t)RK_CHROM;
133 }
134 
141 static inline uint32_t extract_regionkey_startpos(uint64_t rk)
142 {
143  return (uint32_t)RK_STARTPOS;
144 }
145 
152 static inline uint32_t extract_regionkey_endpos(uint64_t rk)
153 {
154  return (uint32_t)RK_ENDPOS;
155 }
156 
163 static inline uint8_t extract_regionkey_strand(uint64_t rk)
164 {
165  return (uint8_t)RK_STRAND;
166 }
167 
173 static inline void decode_regionkey(uint64_t code, regionkey_t *rk)
174 {
175  rk->chrom = extract_regionkey_chrom(code);
177  rk->endpos = extract_regionkey_endpos(code);
178  rk->strand = extract_regionkey_strand(code);
179 }
180 
187 static inline void reverse_regionkey(uint64_t rk, regionkey_rev_t *rev)
188 {
191  rev->endpos = extract_regionkey_endpos(rk);
193 }
194 
/**
 * Returns a 64 bit regionkey based on CHROM, START POS (0-based), END POS and STRAND.
 *
 * @param chrom     Chromosome string, forwarded to encode_chrom().
 * @param sizechrom Size forwarded to encode_chrom() together with chrom
 *                  (presumably the string length excluding the terminator;
 *                  confirm against encode_chrom()).
 * @param startpos  Start position (zero based).
 * @param endpos    End position (startpos + region length).
 * @param strand    Strand direction (-1, 0, +1).
 *
 * @return RegionKey 64 bit code.
 */
static inline uint64_t regionkey(const char *chrom, size_t sizechrom, uint32_t startpos, uint32_t endpos, int8_t strand)
{
    const uint8_t chrom_code = encode_chrom(chrom, sizechrom);
    const uint8_t strand_code = encode_region_strand(strand);
    return encode_regionkey(chrom_code, startpos, endpos, strand_code);
}
209 
215 static inline uint64_t extend_regionkey(uint64_t rk, uint32_t size)
216 {
217  uint64_t startpos = RK_STARTPOS;
218  uint64_t endpos = RK_ENDPOS;
219  startpos = ((size >= startpos) ? 0 : (startpos - size));
220  endpos = (((RK_MAX_POS - endpos) <= size) ? RK_MAX_POS : (endpos + size));
221  return ((rk & RKMASK_NOPOS) | (startpos << RKSHIFT_STARTPOS) | (endpos << RKSHIFT_ENDPOS));
222 }
223 
/**
 * Returns RegionKey hexadecimal string (16 characters).
 *
 * Thin wrapper around hex_uint64_t().
 *
 * @param rk  RegionKey code.
 * @param str Output buffer (presumably at least 17 bytes to hold 16 characters
 *            plus a terminator; confirm against hex_uint64_t()).
 *
 * @return The value returned by hex_uint64_t().
 */
static inline size_t regionkey_hex(uint64_t rk, char *str)
{
    return hex_uint64_t(rk, str);
}
238 
/**
 * Parses a RegionKey hexadecimal string and returns the code.
 *
 * Thin wrapper around parse_hex_uint64_t().
 *
 * @param rs RegionKey hexadecimal string.
 *
 * @return RegionKey 64 bit code as returned by parse_hex_uint64_t().
 */
static inline uint64_t parse_regionkey_hex(const char *rs)
{
    return parse_hex_uint64_t(rs);
}
249 
256 static inline uint64_t get_regionkey_chrom_startpos(uint64_t rk)
257 {
258  return (rk >> RKSHIFT_STARTPOS);
259 }
260 
267 static inline uint64_t get_regionkey_chrom_endpos(uint64_t rk)
268 {
269  return (((rk & RKMASK_CHROM) >> RKSHIFT_STARTPOS) | extract_regionkey_endpos(rk));
270 }
271 
/**
 * Check if two regions are overlapping.
 *
 * Both comparisons are strict, so regions that merely touch
 * (one ends exactly where the other starts) do not overlap.
 *
 * @param a_chrom    Region A chromosome code.
 * @param a_startpos Region A start position.
 * @param a_endpos   Region A end position (startpos + region length).
 * @param b_chrom    Region B chromosome code.
 * @param b_startpos Region B start position.
 * @param b_endpos   Region B end position (startpos + region length).
 *
 * @return 1 if the regions overlap, 0 otherwise.
 */
static inline uint8_t are_overlapping_regions(uint8_t a_chrom, uint32_t a_startpos, uint32_t a_endpos, uint8_t b_chrom, uint32_t b_startpos, uint32_t b_endpos)
{
    return (uint8_t)((a_chrom == b_chrom)
                     && (a_startpos < b_endpos)
                     && (a_endpos > b_startpos));
}
287 
/**
 * Check if a region and a regionkey are overlapping.
 *
 * Both position comparisons are strict, so regions that merely touch do not
 * overlap. The strand stored in the regionkey is not considered.
 *
 * @param chrom    Region chromosome code.
 * @param startpos Region start position.
 * @param endpos   Region end position (startpos + region length).
 * @param rk       RegionKey code.
 *
 * @return 1 if the regions overlap, 0 otherwise.
 */
static inline uint8_t are_overlapping_region_regionkey(uint8_t chrom, uint32_t startpos, uint32_t endpos, uint64_t rk)
{
    return (uint8_t)((chrom == extract_regionkey_chrom(rk))
                     && (startpos < extract_regionkey_endpos(rk))
                     && (endpos > extract_regionkey_startpos(rk)));
}
301 
/**
 * Check if two regionkeys are overlapping.
 *
 * Both position comparisons are strict, so regions that merely touch do not
 * overlap. The strand fields are not considered.
 *
 * @param rka RegionKey A.
 * @param rkb RegionKey B.
 *
 * @return 1 if the regions overlap, 0 otherwise.
 */
static inline uint8_t are_overlapping_regionkeys(uint64_t rka, uint64_t rkb)
{
    return (uint8_t)((extract_regionkey_chrom(rka) == extract_regionkey_chrom(rkb))
                     && (extract_regionkey_startpos(rka) < extract_regionkey_endpos(rkb))
                     && (extract_regionkey_endpos(rka) > extract_regionkey_startpos(rkb)));
}
313 
322 static inline uint8_t are_overlapping_variantkey_regionkey(nrvk_cols_t nvc, uint64_t vk, uint64_t rk)
323 {
325 }
326 
334 static inline uint64_t variantkey_to_regionkey(nrvk_cols_t nvc, uint64_t vk)
335 {
336  return ((vk & VKMASK_CHROMPOS) | ((uint64_t)get_variantkey_endpos(nvc, vk) << RKSHIFT_ENDPOS));
337 }
338 
339 #endif // VARIANTKEY_REGIONKEY_H
#define RK_MAX_POS
Maximum position value (2^28 - 1)
Definition: regionkey.h:47
static uint64_t get_regionkey_chrom_endpos(uint64_t rk)
Get the CHROM + END POS encoding from RegionKey.
Definition: regionkey.h:267
struct regionkey_t regionkey_t
static uint8_t extract_variantkey_chrom(uint64_t vk)
Extract the CHROM code from VariantKey.
Definition: variantkey.h:451
static uint32_t extract_regionkey_endpos(uint64_t rk)
Extract the END POS code from RegionKey.
Definition: regionkey.h:152
static uint8_t are_overlapping_variantkey_regionkey(nrvk_cols_t nvc, uint64_t vk, uint64_t rk)
Check if variantkey and regionkey are overlapping.
Definition: regionkey.h:322
uint32_t endpos
Region end position (pos_start + region_length)
Definition: regionkey.h:82
static uint32_t extract_variantkey_pos(uint64_t vk)
Extract the POS code from VariantKey.
Definition: variantkey.h:462
struct regionkey_rev_t regionkey_rev_t
uint32_t startpos
Region start position (zero based)
Definition: regionkey.h:70
static uint64_t get_regionkey_chrom_startpos(uint64_t rk)
Get the CHROM + START POS encoding from RegionKey.
Definition: regionkey.h:256
static uint8_t are_overlapping_regions(uint8_t a_chrom, uint32_t a_startpos, uint32_t a_endpos, uint8_t b_chrom, uint32_t b_startpos, uint32_t b_endpos)
Check if two regions are overlapping.
Definition: regionkey.h:283
static size_t regionkey_hex(uint64_t rk, char *str)
Returns RegionKey hexadecimal string (16 characters).
Definition: regionkey.h:234
uint32_t endpos
Region end position (pos_start + region_length)
Definition: regionkey.h:71
#define RK_ENDPOS
Extract the END POS code from RegionKey.
Definition: regionkey.h:60
#define RKMASK_CHROM
RegionKey binary mask for CHROM [ 11111000 00000000 00000000 00000000 00000000 00000000 00000000 0000...
Definition: regionkey.h:48
static int8_t decode_region_strand(uint8_t strand)
Decode the strand direction code (0 > 0, 1 > +1, 2 > -1).
Definition: regionkey.h:104
static size_t decode_chrom(uint8_t code, char *chrom)
Decode the chromosome numerical code.
Definition: variantkey.h:145
#define RKMASK_NOPOS
RegionKey binary mask WITHOUT POS [ 11111000 00000000 00000000 00000000 00000000 00000000 00000000 00...
Definition: regionkey.h:52
#define RK_STARTPOS
Extract the START POS code from RegionKey.
Definition: regionkey.h:59
static uint8_t are_overlapping_regionkeys(uint64_t rka, uint64_t rkb)
Check if two regionkeys are overlapping.
Definition: regionkey.h:309
uint32_t startpos
Region start position (zero based)
Definition: regionkey.h:81
#define RKSHIFT_STARTPOS
START POS LSB position from the RegionKey LSB.
Definition: regionkey.h:54
static uint8_t are_overlapping_region_regionkey(uint8_t chrom, uint32_t startpos, uint32_t endpos, uint64_t rk)
Check if a region and a regionkey are overlapping.
Definition: regionkey.h:297
static uint64_t regionkey(const char *chrom, size_t sizechrom, uint32_t startpos, uint32_t endpos, int8_t strand)
Returns a 64 bit regionkey based on CHROM, START POS (0-based), END POS and STRAND.
Definition: regionkey.h:205
static uint64_t extend_regionkey(uint64_t rk, uint32_t size)
Extend a regionkey region by a fixed amount from the start and end position.
Definition: regionkey.h:215
static uint64_t encode_regionkey(uint8_t chrom, uint32_t startpos, uint32_t endpos, uint8_t strand)
Returns a 64 bit regionkey.
Definition: regionkey.h:119
static void reverse_regionkey(uint64_t rk, regionkey_rev_t *rev)
Definition: regionkey.h:187
static uint32_t get_variantkey_endpos(nrvk_cols_t nvc, uint64_t vk)
Definition: nrvk.h:198
uint8_t strand
Encoded region strand direction (0 > 0, 1 > +1, 2 > -1).
Definition: regionkey.h:72
static uint32_t extract_regionkey_startpos(uint64_t rk)
Extract the START POS code from RegionKey.
Definition: regionkey.h:141
Definition: nrvk.h:81
static uint64_t variantkey_to_regionkey(nrvk_cols_t nvc, uint64_t vk)
Get RegionKey from VariantKey.
Definition: regionkey.h:334
Definition: regionkey.h:78
#define RK_STRAND
Extract the STRAND from RegionKey.
Definition: regionkey.h:61
Definition: regionkey.h:67
uint8_t chrom
Chromosome encoded number (only the 5 least significant bits are used)
Definition: regionkey.h:69
static uint8_t encode_chrom(const char *chrom, size_t size)
Returns chromosome numerical encoding.
Definition: variantkey.h:86
int8_t strand
Region strand direction (-1, 0, +1)
Definition: regionkey.h:83
static uint8_t encode_region_strand(int8_t strand)
Encode the strand direction (-1 > 2, 0 > 0, +1 > 1).
Definition: regionkey.h:92
static uint8_t extract_regionkey_strand(uint64_t rk)
Extract the STRAND from RegionKey.
Definition: regionkey.h:163
Functions to retrieve REF and ALT values by VariantKey from binary data file.
char chrom[3]
Chromosome.
Definition: regionkey.h:80
static uint64_t parse_regionkey_hex(const char *rs)
Parses a RegionKey hexadecimal string and returns the code.
Definition: regionkey.h:245
static void decode_regionkey(uint64_t code, regionkey_t *rk)
Decodes a RegionKey code and returns the components as a regionkey_t structure.
Definition: regionkey.h:173
static uint64_t parse_hex_uint64_t(const char *s)
Parses a 16 chars hexadecimal string and returns the code.
Definition: hex.h:67
static uint8_t extract_regionkey_chrom(uint64_t rk)
Extract the CHROM code from RegionKey.
Definition: regionkey.h:130
#define RK_CHROM
Extract the CHROM code from RegionKey.
Definition: regionkey.h:58
#define VKMASK_CHROMPOS
VariantKey binary mask for CHROM+POS [ 11111111 11111111 11111111 11111111 10000000 00000000 00000000...
Definition: variantkey.h:53
#define RKSHIFT_STRAND
STRAND LSB position from the RegionKey LSB.
Definition: regionkey.h:56
#define RKSHIFT_CHROM
CHROM LSB position from the RegionKey LSB.
Definition: regionkey.h:53
#define RKSHIFT_ENDPOS
END POS LSB position from the RegionKey LSB.
Definition: regionkey.h:55
static size_t hex_uint64_t(uint64_t n, char *str)
Returns uint64_t hexadecimal string (16 characters).
Definition: hex.h:56