Skip to content

Commit

Permalink
Greedy canonicalization optimization
Browse files Browse the repository at this point in the history
  • Loading branch information
gmarcais committed Jan 21, 2024
1 parent 1391a26 commit dfa64bd
Show file tree
Hide file tree
Showing 4 changed files with 100 additions and 2 deletions.
2 changes: 1 addition & 1 deletion Tupfile
Original file line number Diff line number Diff line change
Expand Up @@ -17,5 +17,5 @@ COMMON_SRCS = backtrace.cc common.cc sequence.cc
PROGS = traverse_comp greedy_mds mdss2dot comp2rankdot
PROGS += fms2mds optimize_rem_path_len mykkeltveit_set find_longest_path
PROGS += champarnaud_set sketch_components syncmer_set syncmer_sketch frac_set
PROGS += create_seed sketch_histo old_champarnaud_set
PROGS += create_seed sketch_histo old_champarnaud_set opt_canon
run ./rules.sh $(PROGS)
83 changes: 83 additions & 0 deletions opt_canon.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
#include <iostream>
#include <unordered_set>

#include "misc.hpp"
#include "common.hpp"
#include "tarjan_scc.hpp"

#include "opt_canon.hpp"

#ifndef K
#error Must define k-mer length K
#endif

#ifndef ALPHA
#error Must define alphabet length ALPHA
#endif

#include "mer_op.hpp"

typedef mer_op_type<K, ALPHA> mer_ops;
typedef mer_ops::mer_t mer_t;


struct is_in_set {
const std::unordered_set<mer_t>& set;
is_in_set(const std::unordered_set<mer_t>& s) : set(s) {}
bool operator()(mer_t m) const { return set.find(m) != set.cend(); }
};

struct can_is_in_set {
const std::unordered_set<mer_t>& set;
can_is_in_set(const std::unordered_set<mer_t>& s) : set(s) {}
bool operator()(mer_t m) const { return set.find(mer_ops::canonical(m)) != set.cend(); }
};

struct is_in_union {
const std::unordered_set<mer_t>& set;
is_in_union(const std::unordered_set<mer_t>& s) : set(s) {}
bool operator()(mer_t m) const { return set.find(m) != set.cend() || set.find(mer_ops::reverse_comp(m)) != set.cend(); }
};

struct is_in_opt {
const std::unordered_set<mer_t>& set1;
const std::unordered_set<mer_t>& set2;
is_in_opt(const std::unordered_set<mer_t>& s1, const std::unordered_set<mer_t>& s2)
: set1(s1)
, set2(s2)
{}
bool operator()(mer_t m) const {
return (set1.find(m) != set1.cend() || set1.find(mer_ops::reverse_comp(m)) != set1.cend()) && \
(set2.find(m) == set2.end());
}
};

int main(int argc, char* argv[]) {
opt_canon args(argc, argv);
const auto mer_set = get_mds<std::unordered_set<mer_t>>(args.sketch_file_arg, args.sketch_arg);

tarjan_scc<mer_ops> comp_scc;
const is_in_set set_fn(mer_set);
const can_is_in_set can_fn(mer_set);
const auto counts_orig = comp_scc.scc_counts(set_fn);
std::cout << counts_orig.first << ' ' << counts_orig.second << '\n';

std::unordered_set<mer_t> remove;

// Greedily create the remove set. Should compare the SCCs themselves, not
// just their number and size. Sufficient for now....
for(const auto m : mer_set) {
if(can_fn(m)) continue; // Must be a super-set of canonical
// Try adding m to remove. If increase SCCs, don't keep it
remove.insert(m);
const is_in_opt opt(mer_set, remove);
const auto counts = comp_scc.scc_counts(opt);
if(counts.first > counts_orig.first || counts.second > counts_orig.second)
remove.erase(m);
}

std::cout << "orig " << mer_set.size() << ": " << joinT<size_t>(mer_set, ',') << '\n';
std::cout << "remove " << remove.size() << ": " << joinT<size_t>(remove, ',') << '\n';

return EXIT_SUCCESS;
}
11 changes: 11 additions & 0 deletions opt_canon.yaggo
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
purpose "Find set one less than union"

option('f', 'sketch-file') {
description 'File with sketch mer set'
c_string; typestr 'path'
}

arg('sketch') {
description 'k-mers'
c_string; multiple
}
6 changes: 5 additions & 1 deletion tarjan_scc.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,13 +53,17 @@ struct tarjan_scc {
}

template<typename Fn, typename E3>
std::pair<mer_t, mer_t> scc_counts(Fn fn, E3 new_visit = noprogress) {
std::pair<mer_t, mer_t> scc_counts(Fn fn, E3 new_visit) {
mer_t nb_scc = 0, nb_mers = 0;
auto new_scc = [&nb_scc](mer_t index) { ++nb_scc; };
auto new_mer = [&nb_mers](mer_t m) { ++nb_mers; };
scc_iterate(fn, new_scc, new_mer, new_visit);
return std::make_pair(nb_scc, nb_mers);
}
template<typename Fn>
inline std::pair<mer_t, mer_t> scc_counts(Fn fn) {
return scc_counts(fn, noprogress);
}

template<typename Fn, typename E3>
std::pair<mer_t, mer_t> scc_append(Fn fn, std::vector<mer_t>& mers, E3 new_visit = noprogress) {
Expand Down

0 comments on commit dfa64bd

Please sign in to comment.