VariantKey  5.4.1
Numerical Encoding for Human Genetic Variants
esid.h
Go to the documentation of this file.
1 // VariantKey
2 //
3 // esid.h
4 //
5 // @category Libraries
6 // @author Nicola Asuni <nicola.asuni@genomicsplc.com>
7 // @copyright 2017-2018 GENOMICS plc
8 // @license MIT (see LICENSE)
9 // @link https://github.com/genomicsplc/variantkey
10 //
11 // LICENSE
12 //
13 // Copyright (c) 2017-2018 GENOMICS plc
14 //
15 // Permission is hereby granted, free of charge, to any person obtaining a copy
16 // of this software and associated documentation files (the "Software"), to deal
17 // in the Software without restriction, including without limitation the rights
18 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
19 // copies of the Software, and to permit persons to whom the Software is
20 // furnished to do so, subject to the following conditions:
21 //
22 // The above copyright notice and this permission notice shall be included in
23 // all copies or substantial portions of the Software.
24 //
25 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
26 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
27 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
28 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
29 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
30 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
31 // THE SOFTWARE.
32 
40 #ifndef VARIANTKEY_ESID_H
41 #define VARIANTKEY_ESID_H
42 
43 #include <inttypes.h>
44 #include <stdio.h>
45 
46 #define ESID_MAXLEN 10
47 #define ESID_SHIFT 32
48 #define ESID_SHIFTPOS 60
49 #define ESID_CHARBIT 6
50 #define ESID_NUMPOS 27
51 #define ESID_MAXPAD 7
52 
53 static inline uint64_t esid_encode_char(int c)
54 {
55  if (c < '!')
56  {
57  return (uint64_t)('_' - ESID_SHIFT);
58  }
59  if (c > '_')
60  {
61  return (uint64_t)(c - ('a' - 'A' + ESID_SHIFT));
62  }
63  return (uint64_t)(c - ESID_SHIFT);
64 }
65 
66 static inline uint8_t esid_decode_char(uint64_t esid, size_t pos)
67 {
68  return (uint8_t)(((esid >> pos) & 0x3f) + ESID_SHIFT); // 0x3f hex = 63 dec = 00111111 bin
69 }
70 
81 static inline uint64_t encode_string_id(const char *str, size_t size, size_t start)
82 {
83  size -= start;
84  if (size > ESID_MAXLEN)
85  {
86  size = ESID_MAXLEN;
87  }
88  str += start;
89  const char *pos = (str + size - 1);
90  uint64_t h = (size << ESID_SHIFTPOS);
91  switch (size)
92  {
93  case 10:
94  h |= esid_encode_char(*pos--);
95  // fall through
96  case 9:
97  h |= esid_encode_char(*pos--) << (ESID_CHARBIT * 1);
98  // fall through
99  case 8:
100  h |= esid_encode_char(*pos--) << (ESID_CHARBIT * 2);
101  // fall through
102  case 7:
103  h |= esid_encode_char(*pos--) << (ESID_CHARBIT * 3);
104  // fall through
105  case 6:
106  h |= esid_encode_char(*pos--) << (ESID_CHARBIT * 4);
107  // fall through
108  case 5:
109  h |= esid_encode_char(*pos--) << (ESID_CHARBIT * 5);
110  // fall through
111  case 4:
112  h |= esid_encode_char(*pos--) << (ESID_CHARBIT * 6);
113  // fall through
114  case 3:
115  h |= esid_encode_char(*pos--) << (ESID_CHARBIT * 7);
116  // fall through
117  case 2:
118  h |= esid_encode_char(*pos--) << (ESID_CHARBIT * 8);
119  // fall through
120  case 1:
121  h |= esid_encode_char(*pos) << (ESID_CHARBIT * 9);
122  }
123  return h;
124 }
125 
138 static inline uint64_t encode_string_num_id(const char *str, size_t size, char sep)
139 {
140  if (size <= ESID_MAXLEN)
141  {
142  return encode_string_id(str, size, 0);
143  }
144  uint64_t h = 0;
145  uint32_t num = 0;
146  uint8_t nchr = 0, npad = 0;
147  uint8_t bitpos = ESID_SHIFTPOS;
148  int c;
149  while ((c = *str++) && (size--))
150  {
151  if (c == sep)
152  {
153  break;
154  }
155  if (nchr < 5)
156  {
157  bitpos -= ESID_CHARBIT;
158  h |= (esid_encode_char(c) << bitpos);
159  nchr++;
160  }
161  }
162  h |= ((uint64_t)(nchr + ESID_MAXLEN) << ESID_SHIFTPOS); // 4 bit for string length
163  while (((c = *str++) == '0') && (npad < ESID_MAXPAD) && (size--))
164  {
165  npad++;
166  }
167  h |= (npad << ESID_NUMPOS); // 3 bit for 0 padding length
168  while ((c >= '0') && (c <= '9') && (size--))
169  {
170  num = ((num * 10) + (c - '0'));
171  c = *str++;
172  }
173  h |= ((uint64_t)num & 0x7FFFFFF); // 27 bit for number
174  return h;
175 }
176 
177 static inline size_t esid_decode_string_id(size_t size, uint64_t esid, char *str)
178 {
179  switch (size)
180  {
181  case 10:
182  str[9] = esid_decode_char(esid, 0);
183  // fall through
184  case 9:
185  str[8] = esid_decode_char(esid, (ESID_CHARBIT * 1));
186  // fall through
187  case 8:
188  str[7] = esid_decode_char(esid, (ESID_CHARBIT * 2));
189  // fall through
190  case 7:
191  str[6] = esid_decode_char(esid, (ESID_CHARBIT * 3));
192  // fall through
193  case 6:
194  str[5] = esid_decode_char(esid, (ESID_CHARBIT * 4));
195  // fall through
196  case 5:
197  str[4] = esid_decode_char(esid, (ESID_CHARBIT * 5));
198  // fall through
199  case 4:
200  str[3] = esid_decode_char(esid, (ESID_CHARBIT * 6));
201  // fall through
202  case 3:
203  str[2] = esid_decode_char(esid, (ESID_CHARBIT * 7));
204  // fall through
205  case 2:
206  str[1] = esid_decode_char(esid, (ESID_CHARBIT * 8));
207  // fall through
208  case 1:
209  str[0] = esid_decode_char(esid, (ESID_CHARBIT * 9));
210  }
211  str[size] = 0;
212  return size;
213 }
214 
215 static inline size_t esid_decode_string_num_id(size_t size, uint64_t esid, char *str)
216 {
217  size = esid_decode_string_id(size, esid, str);
218  str[size++] = ':';
219  uint8_t npad = (uint8_t)((esid >> ESID_NUMPOS) & ESID_MAXPAD);
220  while (npad--)
221  {
222  str[size++] = '0';
223  }
224  uint64_t num = (esid & 0x7FFFFFF);
225  if (num > 0)
226  {
227  char *ptr = (str + size);
228  size += sprintf(ptr, "%" PRIu64, num);
229  }
230  str[size] = 0;
231  return size;
232 }
233 
244 static inline size_t decode_string_id(uint64_t esid, char *str)
245 {
246  size_t size = (esid >> ESID_SHIFTPOS);
247  if (size > ESID_MAXLEN)
248  {
249  return esid_decode_string_num_id((size - ESID_MAXLEN), esid, str);
250  }
251  return esid_decode_string_id(size, esid, str);
252 }
253 
254 // Mix two 64 bit hash numbers using a MurmurHash3-like algorithm
255 static inline uint64_t muxhash64(uint64_t k, uint64_t h)
256 {
257  k *= 0x87c37b91114253d5;
258  k = (k >> 33) | (k << 31);
259  k *= 0x4cf5ad432745937f;
260  h ^= k;
261  h = (h >> 37) | (h << 27);
262  return ((h * 5) + 0x52dce729);
263 }
264 
274 static inline uint64_t hash_string_id(const char *str, size_t size)
275 {
276  const uint64_t *pos = (const uint64_t *)str; // NOTE endianness
277  const uint64_t *end = pos + (size / 8);
278  uint64_t h = 0;
279  while (pos < end)
280  {
281  h = muxhash64(*pos++, h);
282  }
283  const uint8_t *tail = (const uint8_t *)pos;
284  uint64_t v = 0;
285  switch (size & 7)
286  {
287  case 7:
288  v ^= (uint64_t)tail[6] << (8 * 6);
289  // fall through
290  case 6:
291  v ^= (uint64_t)tail[5] << (8 * 5);
292  // fall through
293  case 5:
294  v ^= (uint64_t)tail[4] << (8 * 4);
295  // fall through
296  case 4:
297  v ^= (uint64_t)tail[3] << (8 * 3);
298  // fall through
299  case 3:
300  v ^= (uint64_t)tail[2] << (8 * 2);
301  // fall through
302  case 2:
303  v ^= (uint64_t)tail[1] << (8 * 1);
304  // fall through
305  case 1:
306  v ^= (uint64_t)tail[0];
307  }
308  if (v > 0)
309  {
310  h = muxhash64(v, h);
311  }
312  // MurmurHash3 finalization mix - force all bits of a hash block to avalanche
313  h ^= h >> 33;
314  h *= 0xff51afd7ed558ccd;
315  h ^= h >> 33;
316  h *= 0xc4ceb9fe1a85ec53;
317  h ^= h >> 33;
318  return (h | 0x8000000000000000); // set the first bit to indicate HASH mode
319 }
320 
321 #endif // VARIANTKEY_ESID_H
#define ESID_SHIFTPOS
Encoded string ID LEN LSB position from LSB [ -—0000 00111111 22222233 33334444 44555555 66666677 77...
Definition: esid.h:48
static uint64_t encode_string_num_id(const char *str, size_t size, char sep)
Definition: esid.h:138
static size_t esid_decode_string_num_id(size_t size, uint64_t esid, char *str)
Definition: esid.h:215
static size_t decode_string_id(uint64_t esid, char *str)
Definition: esid.h:244
static uint8_t esid_decode_char(uint64_t esid, size_t pos)
Definition: esid.h:66
#define ESID_SHIFT
Number used to translate ASCII character values.
Definition: esid.h:47
static uint64_t encode_string_id(const char *str, size_t size, size_t start)
Definition: esid.h:81
static uint64_t muxhash64(uint64_t k, uint64_t h)
Definition: esid.h:255
#define ESID_MAXPAD
Max number of padding zero digits.
Definition: esid.h:51
static uint64_t hash_string_id(const char *str, size_t size)
Definition: esid.h:274
#define ESID_NUMPOS
Number of bit used to encode a number in the srting_num encoding.
Definition: esid.h:50
#define ESID_CHARBIT
Number of bit used to encode a char.
Definition: esid.h:49
#define ESID_MAXLEN
Maximum number of characters that can be encoded.
Definition: esid.h:46
static size_t esid_decode_string_id(size_t size, uint64_t esid, char *str)
Definition: esid.h:177
static uint64_t esid_encode_char(int c)
Definition: esid.h:53