VariantKey  5.4.1
Numerical Encoding for Human Genetic Variants
esid.h File Reference

Utility functions to encode strings. More...

#include <inttypes.h>
#include <stdio.h>

Go to the source code of this file.

Macros

#define ESID_MAXLEN   10
 Maximum number of characters that can be encoded. More...
 
#define ESID_SHIFT   32
 Number used to translate ASCII character values. More...
 
#define ESID_SHIFTPOS   60
 Encoded string ID LEN LSB position from LSB [ ----0000 00111111 22222233 33334444 44555555 66666677 77778888 88999999 ]. More...
 
#define ESID_CHARBIT   6
 Number of bits used to encode a character. More...
 
#define ESID_NUMPOS   27
 Number of bits used to encode a number in the string_num encoding. More...
 
#define ESID_MAXPAD   7
 Max number of padding zero digits. More...
 

Functions

static uint64_t esid_encode_char (int c)
 
static uint8_t esid_decode_char (uint64_t esid, size_t pos)
 
static uint64_t encode_string_id (const char *str, size_t size, size_t start)
 
static uint64_t encode_string_num_id (const char *str, size_t size, char sep)
 
static size_t esid_decode_string_id (size_t size, uint64_t esid, char *str)
 
static size_t esid_decode_string_num_id (size_t size, uint64_t esid, char *str)
 
static size_t decode_string_id (uint64_t esid, char *str)
 
static uint64_t muxhash64 (uint64_t k, uint64_t h)
 
static uint64_t hash_string_id (const char *str, size_t size)
 

Detailed Description

Utility functions to encode strings.

Macro Definition Documentation

#define ESID_CHARBIT   6
#define ESID_MAXLEN   10
#define ESID_MAXPAD   7
#define ESID_NUMPOS   27
#define ESID_SHIFT   32
#define ESID_SHIFTPOS   60

Function Documentation

static size_t decode_string_id ( uint64_t  esid,
char *  str 
)
inline static

Decode the encoded string ID. This function is the reverse of encode_string_id(). The decoded string is always returned in uppercase.

Parameters
esid: Encoded string ID code.
str: String buffer that receives the decoded string. It must be large enough to hold the result (at least 11 bytes).
Returns
The total number of characters excluding the null-character appended at the end of the string.
static uint64_t encode_string_id ( const char *  str,
size_t  size,
size_t  start 
)
inline static

Encode a maximum of 10 characters of a string into a 64-bit unsigned integer. This function can be used to convert generic string IDs to numeric IDs.

Parameters
str: The string to encode. At most 10 characters are encoded; ASCII characters from '!' to 'z' are supported.
size: Length of the string, excluding the terminating null byte.
start: Index of the first character to encode, starting from 0. To encode the last 10 characters, set this value to (size - 10).
Returns
Encoded string ID.
static uint64_t encode_string_num_id ( const char *  str,
size_t  size,
char  sep 
)
inline static

Encode a string composed of a character section followed by a separator character and a numerical section into a 64-bit unsigned integer. For example: "ABCDE:0001234". Encodes up to 5 characters in uppercase, a number up to 2^27, and up to 7 zero-padding digits. If the string is 10 characters or fewer, encode_string_id() is used instead.

Parameters
str: The string to encode. ASCII characters from '!' to 'z' are supported.
size: Length of the string, excluding the terminating null byte.
sep: Separator character between the character section and the numerical section.
Returns
Encoded string ID.
static uint8_t esid_decode_char ( uint64_t  esid,
size_t  pos 
)
inline static
static size_t esid_decode_string_id ( size_t  size,
uint64_t  esid,
char *  str 
)
inline static
static size_t esid_decode_string_num_id ( size_t  size,
uint64_t  esid,
char *  str 
)
inline static
static uint64_t esid_encode_char ( int  c)
inline static
static uint64_t hash_string_id ( const char *  str,
size_t  size 
)
inline static

Hash the input string into a 64 bit unsigned integer. This function can be used to convert long string IDs into non-reversible numeric IDs.

Parameters
str: The string to hash.
size: Length of the string, excluding the terminating null byte.
Returns
Hash string ID.
static uint64_t muxhash64 ( uint64_t  k,
uint64_t  h 
)
inline static