VariantKey  5.4.1
Numerical Encoding for Human Genetic Variants
variantkey.h File Reference

VariantKey main functions. More...

#include <inttypes.h>
#include <stddef.h>
#include <stdio.h>
#include "hex.h"

Go to the source code of this file.

Data Structures

struct  variantkey_t
 
struct  vkrange_t
 

Macros

#define VKMASK_CHROM   0xF800000000000000
 VariantKey binary mask for CHROM [ 11111000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 ]. More...
 
#define VKMASK_POS   0x07FFFFFF80000000
 VariantKey binary mask for POS [ 00000111 11111111 11111111 11111111 10000000 00000000 00000000 00000000 ]. More...
 
#define VKMASK_CHROMPOS   0xFFFFFFFF80000000
 VariantKey binary mask for CHROM+POS [ 11111111 11111111 11111111 11111111 10000000 00000000 00000000 00000000 ]. More...
 
#define VKMASK_REFALT   0x000000007FFFFFFF
 VariantKey binary mask for REF+ALT [ 00000000 00000000 00000000 00000000 01111111 11111111 11111111 11111111 ]. More...
 
#define VKSHIFT_CHROM   59
 CHROM LSB position from the VariantKey LSB. More...
 
#define VKSHIFT_POS   31
 POS LSB position from the VariantKey LSB. More...
 
#define MAXUINT32   0xFFFFFFFF
 Maximum value for uint32_t. More...
 

Typedefs

typedef struct variantkey_t variantkey_t
 
typedef struct vkrange_t vkrange_t
 

Functions

static uint8_t encode_chrom (const char *chrom, size_t size)
 Returns chromosome numerical encoding. More...
 
static size_t decode_chrom (uint8_t code, char *chrom)
 Decode the chromosome numerical code. More...
 
static uint32_t encode_base (const uint8_t c)
 
static int encode_allele (uint32_t *h, uint8_t *bitpos, const char *str, size_t size)
 
static uint32_t encode_refalt_rev (const char *ref, size_t sizeref, const char *alt, size_t sizealt)
 
static uint32_t muxhash (uint32_t k, uint32_t h)
 
static uint32_t encode_packchar (int c)
 
static uint32_t pack_chars_tail (const char *str, size_t size)
 
static uint32_t pack_chars (const char *str)
 
static uint32_t hash32 (const char *str, size_t size)
 
static uint32_t encode_refalt_hash (const char *ref, size_t sizeref, const char *alt, size_t sizealt)
 
static uint32_t encode_refalt (const char *ref, size_t sizeref, const char *alt, size_t sizealt)
 Returns reference+alternate numerical encoding. More...
 
static char decode_base (uint32_t code, int bitpos)
 
static size_t decode_refalt_rev (uint32_t code, char *ref, size_t *sizeref, char *alt, size_t *sizealt)
 
static size_t decode_refalt (uint32_t code, char *ref, size_t *sizeref, char *alt, size_t *sizealt)
 Decode the 32 bit REF+ALT code if reversible (if it has 11 or fewer bases in total and only contains ACGT letters). More...
 
static uint64_t encode_variantkey (uint8_t chrom, uint32_t pos, uint32_t refalt)
 Returns a 64 bit variant key based on the pre-encoded CHROM, POS (0-based) and REF+ALT. More...
 
static uint8_t extract_variantkey_chrom (uint64_t vk)
 Extract the CHROM code from VariantKey. More...
 
static uint32_t extract_variantkey_pos (uint64_t vk)
 Extract the POS code from VariantKey. More...
 
static uint32_t extract_variantkey_refalt (uint64_t vk)
 Extract the REF+ALT code from VariantKey. More...
 
static void decode_variantkey (uint64_t code, variantkey_t *vk)
 Decodes a VariantKey code and returns the components as a variantkey_t structure. More...
 
static uint64_t variantkey (const char *chrom, size_t sizechrom, uint32_t pos, const char *ref, size_t sizeref, const char *alt, size_t sizealt)
 
static void variantkey_range (uint8_t chrom, uint32_t pos_min, uint32_t pos_max, vkrange_t *range)
 Returns minimum and maximum VariantKeys for range searches. More...
 
static int8_t compare_uint64_t (uint64_t a, uint64_t b)
 
static int8_t compare_variantkey_chrom (uint64_t vka, uint64_t vkb)
 Compares two VariantKeys by chromosome only. More...
 
static int8_t compare_variantkey_chrom_pos (uint64_t vka, uint64_t vkb)
 Compares two VariantKeys by chromosome and position. More...
 
static size_t variantkey_hex (uint64_t vk, char *str)
 Returns VariantKey hexadecimal string (16 characters). More...
 
static uint64_t parse_variantkey_hex (const char *vs)
 Parses a VariantKey hexadecimal string and returns the code. More...
 

Detailed Description

The functions provided here allow generating and processing 64 bit Unsigned Integer Keys for Human Genetic Variants. The VariantKey is sortable by chromosome and position, and it is also fully reversible for variants with up to 11 bases between Reference and Alternate alleles. It can be used to sort, search and match variant-based data easily and very quickly.

Macro Definition Documentation

#define MAXUINT32   0xFFFFFFFF
#define VKMASK_CHROM   0xF800000000000000
#define VKMASK_CHROMPOS   0xFFFFFFFF80000000
#define VKMASK_POS   0x07FFFFFF80000000
#define VKMASK_REFALT   0x000000007FFFFFFF
#define VKSHIFT_CHROM   59
#define VKSHIFT_POS   31

Typedef Documentation

typedef struct variantkey_t variantkey_t

VariantKey struct. Contains the numerically encoded VariantKey components (CHROM, POS, REF+ALT).

typedef struct vkrange_t vkrange_t

Struct containing the minimum and maximum VariantKey values for range searches.

Function Documentation

static int8_t compare_uint64_t ( uint64_t  a,
uint64_t  b 
)
inlinestatic
static int8_t compare_variantkey_chrom ( uint64_t  vka,
uint64_t  vkb 
)
inlinestatic
Parameters
vka: The first VariantKey to be compared.
vkb: The second VariantKey to be compared.
Returns
-1 if the first chromosome is smaller than the second, 0 if they are equal and 1 if the first is greater than the second.
static int8_t compare_variantkey_chrom_pos ( uint64_t  vka,
uint64_t  vkb 
)
inlinestatic
Parameters
vka: The first VariantKey to be compared.
vkb: The second VariantKey to be compared.
Returns
-1 if the first CHROM+POS is smaller than the second, 0 if they are equal and 1 if the first is greater than the second.
static char decode_base ( uint32_t  code,
int  bitpos 
)
inlinestatic
static size_t decode_chrom ( uint8_t  code,
char *  chrom 
)
inlinestatic
Parameters
code: CHROM code.
chrom: CHROM string buffer to be returned. Its size should be enough to contain the results (max 4 bytes).
Returns
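If successful, the total number of characters written is returned, excluding the null character appended at the end of the string. Otherwise a negative number is returned in case of failure (note that the declared return type is size_t, which is unsigned, so a negative value would be converted to a large positive value and cannot be detected with a "< 0" test).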
static size_t decode_refalt ( uint32_t  code,
char *  ref,
size_t *  sizeref,
char *  alt,
size_t *  sizealt 
)
inlinestatic
Parameters
code: REF+ALT code.
ref: REF string buffer to be returned.
sizeref: Pointer to the size of the ref buffer, excluding the terminating null byte. This will contain the final ref size.
alt: ALT string buffer to be returned.
sizealt: Pointer to the size of the alt buffer, excluding the terminating null byte. This will contain the final alt size.
Returns
If the code is reversible, then the total number of characters of REF+ALT is returned. Otherwise 0 is returned.
static size_t decode_refalt_rev ( uint32_t  code,
char *  ref,
size_t *  sizeref,
char *  alt,
size_t *  sizealt 
)
inlinestatic
static void decode_variantkey ( uint64_t  code,
variantkey_t *  vk 
)
inlinestatic
Parameters
code: VariantKey code.
vk: Decoded variantkey structure.
static int encode_allele ( uint32_t *  h,
uint8_t *  bitpos,
const char *  str,
size_t  size 
)
inlinestatic
static uint32_t encode_base ( const uint8_t  c)
inlinestatic
static uint8_t encode_chrom ( const char *  chrom,
size_t  size 
)
inlinestatic
Parameters
chrom: Chromosome. An identifier from the reference genome, no white-space permitted.
size: Length of the chrom string, excluding the terminating null byte.
Returns
CHROM code
static uint32_t encode_packchar ( int  c)
inlinestatic
static uint32_t encode_refalt ( const char *  ref,
size_t  sizeref,
const char *  alt,
size_t  sizealt 
)
inlinestatic
Parameters
ref: Reference allele. String containing a sequence of nucleotide letters. The value in the pos field refers to the position of the first nucleotide in the string. Characters must be A-Z, a-z or *
sizeref: Length of the ref string, excluding the terminating null byte.
alt: Alternate non-reference allele string. Characters must be A-Z, a-z or *
sizealt: Length of the alt string, excluding the terminating null byte.
Returns
REF+ALT code
static uint32_t encode_refalt_hash ( const char *  ref,
size_t  sizeref,
const char *  alt,
size_t  sizealt 
)
inlinestatic
static uint32_t encode_refalt_rev ( const char *  ref,
size_t  sizeref,
const char *  alt,
size_t  sizealt 
)
inlinestatic
static uint64_t encode_variantkey ( uint8_t  chrom,
uint32_t  pos,
uint32_t  refalt 
)
inlinestatic
Parameters
chrom: Encoded Chromosome (see encode_chrom).
pos: Position. The reference position, with the first base having position 0.
refalt: Encoded Reference + Alternate (see encode_refalt).
Returns
VariantKey 64 bit code.
static uint8_t extract_variantkey_chrom ( uint64_t  vk)
inlinestatic
Parameters
vk: VariantKey code.
Returns
CHROM code.
static uint32_t extract_variantkey_pos ( uint64_t  vk)
inlinestatic
Parameters
vk: VariantKey code.
Returns
POS.
static uint32_t extract_variantkey_refalt ( uint64_t  vk)
inlinestatic
Parameters
vk: VariantKey code.
Returns
REF+ALT code.
static uint32_t hash32 ( const char *  str,
size_t  size 
)
inlinestatic
static uint32_t muxhash ( uint32_t  k,
uint32_t  h 
)
inlinestatic
static uint32_t pack_chars ( const char *  str)
inlinestatic
static uint32_t pack_chars_tail ( const char *  str,
size_t  size 
)
inlinestatic
static uint64_t parse_variantkey_hex ( const char *  vs)
inlinestatic
Parameters
vs: VariantKey hexadecimal string (it must contain 16 hexadecimal characters).
Returns
A VariantKey code.
static uint64_t variantkey ( const char *  chrom,
size_t  sizechrom,
uint32_t  pos,
const char *  ref,
size_t  sizeref,
const char *  alt,
size_t  sizealt 
)
inlinestatic

Returns a 64 bit variant key based on CHROM, POS (0-based), REF, ALT. The variant should be already normalized (see normalize_variant or use normalized_variantkey).

Parameters
chrom: Chromosome. An identifier from the reference genome, no white-space or leading zeros permitted.
sizechrom: Length of the chrom string, excluding the terminating null byte.
pos: Position. The reference position, with the first base having position 0.
ref: Reference allele. String containing a sequence of nucleotide letters. The value in the pos field refers to the position of the first nucleotide in the string. Characters must be A-Z, a-z or *
sizeref: Length of the ref string, excluding the terminating null byte.
alt: Alternate non-reference allele string. Characters must be A-Z, a-z or *
sizealt: Length of the alt string, excluding the terminating null byte.
Returns
VariantKey 64 bit code.
static size_t variantkey_hex ( uint64_t  vk,
char *  str 
)
inlinestatic

The string represents a 64 bit number composed of:

  • 5 bit for CHROM
  • 28 bit for POS
  • 31 bit for REF+ALT
Parameters
vk: VariantKey code.
str: String buffer to be returned (it must be sized 17 bytes at least).
Returns
Upon successful return, this function returns the number of characters processed (excluding the null byte used to end output to strings). If the buffer size is not sufficient, then the return value is the number of characters required for the buffer string, including the terminating null byte.
static void variantkey_range ( uint8_t  chrom,
uint32_t  pos_min,
uint32_t  pos_max,
vkrange_t *  range 
)
inlinestatic
Parameters
chrom: Chromosome encoded number.
pos_min: Start reference position, with the first base having position 0.
pos_max: End reference position, with the first base having position 0.
range: VariantKey range values.