diff --git a/Tupfile b/Tupfile index eb5222b..76c3d4d 100644 --- a/Tupfile +++ b/Tupfile @@ -17,5 +17,5 @@ COMMON_SRCS = backtrace.cc common.cc sequence.cc PROGS = traverse_comp greedy_mds mdss2dot comp2rankdot PROGS += fms2mds optimize_rem_path_len mykkeltveit_set find_longest_path PROGS += champarnaud_set sketch_components syncmer_set syncmer_sketch frac_set -PROGS += create_seed sketch_histo old_champarnaud_set +PROGS += create_seed sketch_histo old_champarnaud_set opt_canon run ./rules.sh $(PROGS) diff --git a/opt_canon.cc b/opt_canon.cc new file mode 100644 index 0000000..8320d00 --- /dev/null +++ b/opt_canon.cc @@ -0,0 +1,83 @@ +#include +#include + +#include "misc.hpp" +#include "common.hpp" +#include "tarjan_scc.hpp" + +#include "opt_canon.hpp" + +#ifndef K + #error Must define k-mer length K +#endif + +#ifndef ALPHA + #error Must define alphabet length ALPHA +#endif + +#include "mer_op.hpp" + +typedef mer_op_type mer_ops; +typedef mer_ops::mer_t mer_t; + + +struct is_in_set { + const std::unordered_set& set; + is_in_set(const std::unordered_set& s) : set(s) {} + bool operator()(mer_t m) const { return set.find(m) != set.cend(); } +}; + +struct can_is_in_set { + const std::unordered_set& set; + can_is_in_set(const std::unordered_set& s) : set(s) {} + bool operator()(mer_t m) const { return set.find(mer_ops::canonical(m)) != set.cend(); } +}; + +struct is_in_union { + const std::unordered_set& set; + is_in_union(const std::unordered_set& s) : set(s) {} + bool operator()(mer_t m) const { return set.find(m) != set.cend() || set.find(mer_ops::reverse_comp(m)) != set.cend(); } +}; + +struct is_in_opt { + const std::unordered_set& set1; + const std::unordered_set& set2; + is_in_opt(const std::unordered_set& s1, const std::unordered_set& s2) + : set1(s1) + , set2(s2) + {} + bool operator()(mer_t m) const { + return (set1.find(m) != set1.cend() || set1.find(mer_ops::reverse_comp(m)) != set1.cend()) && \ + (set2.find(m) == set2.end()); + } +}; + +int main(int argc, char* argv[]) { + opt_canon args(argc, argv); + const auto mer_set = get_mds>(args.sketch_file_arg, args.sketch_arg); + + tarjan_scc comp_scc; + const is_in_set set_fn(mer_set); + const can_is_in_set can_fn(mer_set); + const auto counts_orig = comp_scc.scc_counts(set_fn); + std::cout << counts_orig.first << ' ' << counts_orig.second << '\n'; + + std::unordered_set remove; + + // Greedily create the remove set. Should compare the SCCs themselves, not + // just their number and size. Sufficient for now.... + for(const auto m : mer_set) { + if(can_fn(m)) continue; // Must be a super-set of canonical + // Try adding m to remove. If increase SCCs, don't keep it + remove.insert(m); + const is_in_opt opt(mer_set, remove); + const auto counts = comp_scc.scc_counts(opt); + if(counts.first > counts_orig.first || counts.second > counts_orig.second) + remove.erase(m); + } + + std::cout << "orig " << mer_set.size() << ": " << joinT(mer_set, ',') << '\n'; + std::cout << "remove " << remove.size() << ": " << joinT(remove, ',') << '\n'; + + return EXIT_SUCCESS; +} diff --git a/opt_canon.yaggo b/opt_canon.yaggo new file mode 100644 index 0000000..28134b0 --- /dev/null +++ b/opt_canon.yaggo @@ -0,0 +1,11 @@ +purpose "Find set one less than union" + +option('f', 'sketch-file') { + description 'File with sketch mer set' + c_string; typestr 'path' +} + +arg('sketch') { + description 'k-mers' + c_string; multiple +} diff --git a/tarjan_scc.hpp b/tarjan_scc.hpp index 68ffcc4..b4c015d 100644 --- a/tarjan_scc.hpp +++ b/tarjan_scc.hpp @@ -53,13 +53,17 @@ struct tarjan_scc { } template - std::pair scc_counts(Fn fn, E3 new_visit = noprogress) { + std::pair scc_counts(Fn fn, E3 new_visit) { mer_t nb_scc = 0, nb_mers = 0; auto new_scc = [&nb_scc](mer_t index) { ++nb_scc; }; auto new_mer = [&nb_mers](mer_t m) { ++nb_mers; }; scc_iterate(fn, new_scc, new_mer, new_visit); return std::make_pair(nb_scc, nb_mers); } + template + inline std::pair scc_counts(Fn fn) { + return scc_counts(fn, noprogress); + } template std::pair scc_append(Fn fn, std::vector& mers, E3 new_visit = noprogress) {