Skip to content

Commit

Permalink
Support for 128 bit base type for mers.
Browse files Browse the repository at this point in the history
* Some program don't run at this size.
  • Loading branch information
gmarcais committed Jul 16, 2024
1 parent d3973cd commit 11601b1
Show file tree
Hide file tree
Showing 11 changed files with 732 additions and 236 deletions.
3 changes: 2 additions & 1 deletion Tupfile
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@ include_rules
export PKG_CONFIG_PATH

# CXXFLAGS=-O3 -DNDEBUG -Wall -Werror -DHAVE_EXECINFO_H `pkg-config --cflags libxxhash` -pthread
CXXFLAGS=-Wall -Werror -DHAVE_EXECINFO_H -I$(TUP_VARIANTDIR) -pthread -std=c++20 -DHAVE_INT128
# CXXFLAGS=-Wall -Werror -DHAVE_EXECINFO_H -I$(TUP_VARIANTDIR) -pthread -std=c++20 -DHAVE_INT128
CXXFLAGS=-Wall -DHAVE_EXECINFO_H -I$(TUP_VARIANTDIR) -pthread -std=gnu++20 -DHAVE_INT128
LDFLAGS=-pthread
LDLIBS=

Expand Down
10 changes: 10 additions & 0 deletions common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include <set>
#include <unordered_set>
#include <unordered_map>
#include <optional>

// State of a mer. Either it is nil (unknown), no (absent), yes (present) or
// blocked (should not take part in an F-move).
Expand Down Expand Up @@ -118,4 +119,13 @@ std::ostream& operator<<(std::ostream& os, const std::pair<T1, T2>& p) {
return os << p.first << ':' << p.second;
}

template<typename T>
std::ostream& operator<<(std::ostream& os, const std::optional<T>& x) {
if(x)
os << "Some(" << *x << ')';
else
os << "None";
return os;
}

#endif // COMMON_H_
11 changes: 11 additions & 0 deletions configure.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ ALPHA (the alphabet size) and K (the k-mer length) are required. Extra
compilation flags or option can be passed in the following environment
variables:
CXX Path to g++ version at least 12
CXXFLAGS Compilation flags
LDFLAGS Linker flags
LDLIBS Extra libraries flags
Expand Down Expand Up @@ -90,12 +91,22 @@ done
[ -z "$ILPPYTHON" ] && echo >&2 "No ILP: didn't find a satisfying Python interpreter and packages"
fi

# Find a valid version for g++
GCXX=
for gcc in $CXX g++ g++-12; do
$gcc -o check_gcc_version -O0 check_gcc_version.cc
./check_gcc_version 12 0 && GCXX=$gcc && break
done
rm -f check_gcc_version
[ -z "$GCXX" ] && { echo >&2 "Didn't find g++ version at least 12.0"; false; }

mkdir -p configs
confFile=configs/${NAME}.config
tmpFile=${confFile}.tmp
cat > "$tmpFile" <<EOF
CONFIG_ALPHA=$ALPHA
CONFIG_K=$K
CONFIG_CXX=$GCXX
CONFIG_CXXFLAGS=$OPTFLAGS $CXXFLAGS
CONFIG_LDFLAGS=$LDFLAGS
CONFIG_LDLIBS=$LDLIBS
Expand Down
181 changes: 172 additions & 9 deletions mer_op.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,29 @@
#ifndef MER_OP_H
#define MER_OP_H

#include <inttypes.h>
// #include <inttypes.h>
#include <type_traits>
#include <cstdint>
#include <limits>
#include <algorithm>
#include <numeric>
#include <ostream>
#include <iostream>

// Print 128 bit long integers. Not very fast. Ignore formatting
std::ostream& operator<<(std::ostream& os, __uint128_t x) {
static constexpr int buflen = 40;
char buf[buflen];
char* ptr = &buf[buflen - 1];
*ptr = '\0';

do {
--ptr;
*ptr = ((char)(x % 10 + '0'));
x /= 10;
} while(x > 0);
return os << ptr;
}

// Number of bits to encode a^k
constexpr unsigned int log2ak(unsigned int a, unsigned k) {
Expand Down Expand Up @@ -44,7 +61,8 @@ struct optimal_int<bits, T> {
};

template<typename T>
constexpr typename std::enable_if<std::is_integral<T>::value, T>::type
// constexpr typename std::enable_if<std::is_integral<T>::value, T>::type
constexpr T
ipow(T base, unsigned int exp) {
T result = 1;
for(;;) {
Expand All @@ -65,10 +83,64 @@ constexpr size_t nb_necklaces(unsigned a, unsigned k) {
return res / k;
}

// Bit twiddling operation to reverse bits in a word. Used for reverse
// complementation when the alphabet is a power of 2 (alpha == 2 or 4). Only
// defined on unsigned word types.
//
// Checkered mask. cmask<uint16_t, 1> is every other bit on (0x55).
// cmask<uint16_t,2> is two bits one, two bits off (0x33). Etc.
template<typename U, int len, int l = sizeof(U) * 8 / (2 * len)>
struct cmask {
static constexpr
std::enable_if<std::is_unsigned<U>::value && std::is_integral<U>::value, U>::type
v = (cmask<U, len, l - 1>::v << (2 * len)) | (((U)1 << len) - 1);
};

// When len is half of the word size, shifting by (2 * len) is undefined
// behavior. Fix it here.
template<typename U, int l>
struct cmask<U, sizeof(U) * 4, l> {
static constexpr
std::enable_if<std::is_unsigned<U>::value && std::is_integral<U>::value, U>::type
v = (((U)1 << (sizeof(U) * 4)) - 1);
};

// Base case, when l = 0, start with empty (0) word.
template<typename U, int len>
struct cmask<U, len, 0> {
static constexpr
std::enable_if<std::is_unsigned<U>::value && std::is_integral<U>::value, U>::type
v = 0;
};

template<typename U, unsigned alpha>
inline
std::enable_if<std::is_unsigned<U>::value && std::is_integral<U>::value, U>::type
word_reverse_complement(U w) {
if constexpr (alpha == 2)
w = ((w >> 1) & cmask<U, 1 >::v) | ((w & cmask<U, 1 >::v) << 1);
w = ((w >> 2) & cmask<U, 2 >::v) | ((w & cmask<U, 2 >::v) << 2);
w = ((w >> 4) & cmask<U, 4 >::v) | ((w & cmask<U, 4 >::v) << 4);
if constexpr (sizeof(U) >= 2)
w = ((w >> 8) & cmask<U, 8 >::v) | ((w & cmask<U, 8 >::v) << 8);
if constexpr (sizeof(U) >= 4)
w = ((w >> 16) & cmask<U, 16>::v) | ((w & cmask<U, 16>::v) << 16);
if constexpr (sizeof(U) >= 8)
w = ((w >> 32) & cmask<U, 32>::v) | ((w & cmask<U, 32>::v) << 32);
if constexpr (sizeof(U) >= 16)
w = ((w >> 64) & cmask<U, 64>::v) | ((w & cmask<U, 64>::v) << 64);
// return ~w;
return ((U)-1) - w;
}

template<unsigned int k_, unsigned int alpha_>
struct mer_op_type {
// typedef mer_type mer_t;
typedef typename optimal_int<log2ak(alpha_, k_), uint8_t, uint16_t, uint32_t, uint64_t>::type mer_t;
constexpr static unsigned int ak_bits = log2ak(alpha_, k_);
typedef typename optimal_int<ak_bits, uint8_t, uint16_t, uint32_t, uint64_t, __uint128_t>::type mer_t;

// Skip many program if encoding k takes too many bits
constexpr static unsigned int max_bits = 34;

constexpr static unsigned int k = k_;
constexpr static unsigned int alpha = alpha_;
Expand Down Expand Up @@ -161,18 +233,109 @@ struct mer_op_type {
return w;
}

static mer_t reverse_comp(const mer_t m) {
mer_t res = 0;
mer_t left = m;
for(unsigned int i = 0; i < k; ++i, left /= alpha) {
res = (res * alpha) + (alpha - 1 - (left % alpha));
inline static mer_t reverse_comp(const mer_t m) {
// Optimize for binary and DNA alphabet with bit twiddling
if constexpr (alpha == 2 || alpha == 4) {
const mer_t wc = word_reverse_complement<mer_t,alpha>(m);
constexpr unsigned shift = (8 * sizeof(mer_t) - (alpha/2)*k);
return wc >> shift;
} else {
// For general alphabets, looping algorithm
mer_t res = 0;
mer_t left = m;
for(unsigned int i = 0; i < k; ++i, left /= alpha)
res = (res * alpha) + (alpha - 1 - (left % alpha));
return res;
}
return res;
}

static mer_t canonical(const mer_t m) {
return std::min(m, reverse_comp(m));
}
};


template<unsigned int k_, unsigned int alpha_>
struct amer_type {
typedef mer_op_type<k_, alpha_> mer_ops;
typedef mer_ops::mer_t mer_t;
mer_t val;

amer_type() = default;
amer_type(mer_t x) : val(x) {}
amer_type(const amer_type& rhs) : val(rhs.val) {}
static amer_type homopolymer(const mer_t base) { return mer_ops::homopolymer(base); }

inline amer_type lb() const { return mer_ops::lb(val); }
inline amer_type rb() const { return mer_ops::rb(val); }
inline amer_type nmer() const { return mer_ops::nmer(val); }
inline amer_type nmer(const mer_t base) const { return mer_ops::nmer(val, base); }
inline amer_type pmer() const { return mer_ops::pmer(val); }
inline amer_type pmer(const mer_t base) const { return mer_ops::pmer(val, base); }
inline amer_type fmove() const { return mer_ops::fmove(val); }
inline amer_type rfmove() const { return mer_ops::rfmove(val); }
inline bool are_lc(const amer_type& rhs) const { return mer_ops::are_lc(val, rhs.val); }
inline bool ar_rc(const amer_type& rhs) const { return mer_ops::are_rc(val, rhs.val); }
inline amer_type lc(const mer_t base) const { return mer_ops::lc(val, base); }
inline amer_type rc(const mer_t base) const { return mer_ops::rc(val, base); }
inline bool is_homopolymer() const { return mer_ops::is_homopolymer(val); }
inline bool is_homopolymer_fm() const { return mer_ops::is_homopolymer_fm(val); }
inline mer_t weight() const { return mer_ops::weight(val); }
inline amer_type reverse_comp() const { return mer_ops::reverse_comp(val); }
inline amer_type canonical() const { return mer_ops::canonical(val); }

struct mer_rc_pair {
amer_type mer, rc;
mer_rc_pair(const amer_type& m)
: mer(m)
, rc(m.reverse_comp())
{}
mer_rc_pair& operator++() {
++mer.val;
rc.val -= mer_ops::nb_fmoves;
return *this;
}
mer_rc_pair& operator--() {
--mer;
rc += mer_ops::nb_fmoves;
return *this;
}
};

};

template<unsigned int k_, unsigned int alpha_>
std::ostream& operator<<(std::ostream& os, const amer_type<k_, alpha_>& m) {
return os << (uint64_t)m.val;
}

template<unsigned int k_, unsigned int alpha_>
inline bool operator==(const amer_type<k_, alpha_>& x, const amer_type<k_, alpha_>& y) {
return x.val == y.val;
}

template<unsigned int k_, unsigned int alpha_>
inline bool operator!=(const amer_type<k_, alpha_>& x, const amer_type<k_, alpha_>& y) {
return x.val != y.val;
}

template<unsigned int k_, unsigned int alpha_>
inline bool operator<(const amer_type<k_, alpha_>& x, const amer_type<k_, alpha_>& y) {
return x.val < y.val;
}


template<unsigned int k_, unsigned int alpha_>
struct std::hash<amer_type<k_, alpha_>> {
typedef amer_type<k_, alpha_> amer_t;
std::size_t operator()(const amer_t& x) const noexcept {
return std::hash<typename amer_t::mer_t>{}(x.val);
}
};

template<unsigned int k_, unsigned int alpha_>
inline std::ostream& operator<<(std::ostream& os, const typename amer_type<k_, alpha_>::mer_rc_pair& pair) {
return os << '(' << pair.mer << ',' << pair.rc << ')';
}

#endif // MER_OP_H
9 changes: 7 additions & 2 deletions misc.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,12 +74,17 @@ std::vector<mer_type> mds_from_file(const char* path_mds, bool sort = true) {
}

template<typename C>
C get_mds(const char* path_mds, std::vector<const char*> args) {
C res;
void get_mds(const char* path_mds, std::vector<const char*> args, C& res) {
if(path_mds != nullptr && path_mds[0] != '\0')
mds_read_to(path_mds, res);
if(args.size() > 0)
mds_parse_to(args, res);
}

template<typename C>
inline C get_mds(const char* path_mds, std::vector<const char*> args) {
C res;
get_mds(path_mds, args, res);
return res;
}

Expand Down
Loading

0 comments on commit 11601b1

Please sign in to comment.