diff --git a/src/gfa.cpp b/src/gfa.cpp index 435fd29dcd3..873e313be12 100644 --- a/src/gfa.cpp +++ b/src/gfa.cpp @@ -2,9 +2,12 @@ #include "utility.hpp" #include "path.hpp" #include +#include #include +//#define debug + namespace vg { using namespace std; @@ -18,7 +21,7 @@ static void write_w_line(const PathHandleGraph* graph, ostream& out, path_handle void graph_to_gfa(const PathHandleGraph* graph, ostream& out, const set& rgfa_paths, bool rgfa_pline, bool use_w_lines) { - + // TODO: Support sorting nodes, paths, and/or edges for canonical output // TODO: Use a NamedNodeBackTranslation (or forward translation?) to properly round-trip GFA that has had to be chopped. @@ -40,8 +43,8 @@ void graph_to_gfa(const PathHandleGraph* graph, ostream& out, const set& } out << "\n"; - //Compute the rGFA tags of given paths (todo: support non-zero ranks) - unordered_map> node_offsets; + //Compute the rGFA tags of given paths + unordered_map> node_offsets; for (const string& path_name : rgfa_paths) { path_handle_t path_handle = graph->get_path_handle(path_name); size_t offset = 0; @@ -72,10 +75,35 @@ void graph_to_gfa(const PathHandleGraph* graph, ostream& out, const set& out << graph->get_sequence(h); auto it = node_offsets.find(node_id); if (it != node_offsets.end()) { - // add rGFA tags - out << "\t" << "SN:Z:" << graph->get_path_name(it->second.first) - << "\t" << "SO:i:" << it->second.second - << "\t" << "SR:i:0"; // todo: support non-zero ranks? 
+                // hack off the rgfa stuff from the path, and move into tags
+                string path_name = graph->get_path_name(get<0>(it->second));
+                string rgfa_tags;
+                string base_name;
+                if (get_rgfa_rank(path_name) >= 0) {
+                    // reuse the name we just derived instead of re-querying the graph
+                    base_name = parse_rgfa_name_into_tags(path_name, rgfa_tags);
+                } else {
+                    // no tags, must be the rank-0 reference
+                    base_name = path_name;
+                    rgfa_tags = "SR:i:0";
+                }
+                // hack off the subrange offset (and add it to SO)
+                PathSense sense;
+                string sample, locus;
+                size_t haplotype, phase_block;
+                subrange_t subrange;
+                PathMetadata::parse_path_name(base_name, sense, sample, locus, haplotype, phase_block, subrange);
+                int64_t base_offset = subrange == PathMetadata::NO_SUBRANGE ? 0 : subrange.first;
+                base_name = PathMetadata::create_path_name(sense, sample, locus, haplotype, phase_block,
+                                                           PathMetadata::NO_SUBRANGE);
+                // add rGFA tags
+                out << "\t" << "SN:Z:" << base_name
+                    << "\t" << "SO:i:" << (base_offset + it->second.second);
+                if (subrange != PathMetadata::NO_SUBRANGE) {
+                    // todo: assert this doesn't happen (at least off-reference)
+                    assert(subrange.second > subrange.first);
+                    // GFA optional tags are TAG:TYPE:VALUE -- the type needs its trailing colon
+                    out << "\t" << "SL:i:" << to_string(subrange.second - subrange.first);
+                }
+                out << "\t" << rgfa_tags;
             }
             out << "\n"; // Writing `std::endl` would flush the buffer.
             return true;
@@ -106,6 +134,12 @@ void graph_to_gfa(const PathHandleGraph* graph, ostream& out, const set<string>&
     // Paths as P-lines
     for (const path_handle_t& h : path_handles) {
         auto path_name = graph->get_path_name(h);
+        if (get_rgfa_rank(path_name) > 0) {
+            if (!rgfa_paths.empty()) {
+                // the path was put into tags, no reason to deal with it here
+                continue;
+            }
+        }
         if (rgfa_pline || !rgfa_paths.count(path_name)) {
             if (graph->get_sense(h) != PathSense::REFERENCE && reference_samples.count(graph->get_sample_name(h))) {
                 // We have a mix of reference and non-reference paths on the same sample which GFA can't handle.
@@ -141,6 +175,13 @@ void graph_to_gfa(const PathHandleGraph* graph, ostream& out, const set& { unordered_map, size_t> last_phase_block_end; for (const path_handle_t& h : w_line_paths) { + string path_name = graph->get_path_name(h); + if (get_rgfa_rank(path_name) > 0) { + if (!rgfa_paths.empty()) { + // the path was put into tags, no reason to deal with it here + continue; + } + } write_w_line(graph, out, h, last_phase_block_end); } } @@ -226,7 +267,7 @@ void write_w_line(const PathHandleGraph* graph, ostream& out, path_handle_t path if (phase_block_end_cursor != 0) { if (start_offset != 0) { // TODO: Work out a way to support phase blocks and subranges at the same time. - cerr << "[gfa] error: cannot write multiple phase blocks on a sample, haplotyope, and contig in GFA format" + cerr << "[gfa] error: cannot write multiple phase blocks on a sample, haplotype, and contig in GFA format" << " when paths already have subranges. Fix path " << graph->get_path_name(path_handle) << endl; exit(1); } @@ -247,4 +288,986 @@ void write_w_line(const PathHandleGraph* graph, ostream& out, path_handle_t path out << "\n"; } +int get_rgfa_rank(const string& path_name) { + int rank = -1; + + PathSense path_sense; + string path_sample; + string path_locus; + size_t path_haplotype; + size_t path_phase_block; + subrange_t path_subrange; + PathMetadata::parse_path_name(path_name, path_sense, path_sample, path_locus, + path_haplotype, path_phase_block, path_subrange); + + size_t pos = path_locus.rfind(":SR:i:"); + if (pos != string::npos && path_locus.length() - pos >= 6) { + pos += 6; + size_t pos2 = path_locus.find(":", pos); + size_t len = pos2 == string::npos ? 
pos2 : pos2 - pos; + string rank_string = path_locus.substr(pos, len); + rank = parse(rank_string); + } + return rank; +} + +string create_rgfa_path_name(const string& path_name, int rgfa_rank, const subrange_t& rgfa_subrange, + const string& rgfa_sample, + const string& start_parent_path_name, + int64_t start_parent_offset, + int64_t start_parent_node_id, + bool start_parent_node_reversed, + const string& end_parent_path_name, + int64_t end_parent_offset, + int64_t end_parent_node_id, + bool end_parent_node_reversed) { + PathSense path_sense; + string path_sample; + string path_locus; + size_t path_haplotype; + size_t path_phase_block; + subrange_t path_subrange; + PathMetadata::parse_path_name(path_name, path_sense, path_sample, path_locus, + path_haplotype, path_phase_block, path_subrange); + + // we're going to store absolutely everything in the locus name, including the sample name (behind SN:Z:) + assert(path_locus != PathMetadata::NO_LOCUS_NAME); + string base_name; + if (path_sample != PathMetadata::NO_SAMPLE_NAME) { + base_name = "SN:Z:" + path_sample; + } + // the contig name will be behind SC... + base_name += ":SC:Z:" + path_locus; + + // and we also load in our rGFA rank + base_name += ":SR:i:" + std::to_string(rgfa_rank); + + // start parent parent path (swap # for % as hack to not much up parent paths formatting) + // turns out we can't embed []'s either, so swap those too + if (!start_parent_path_name.empty()) { + string spn = start_parent_path_name; + std::replace(spn.begin(), spn.end(), '#', '%'); + std::replace(spn.begin(), spn.end(), '[', '('); + std::replace(spn.begin(), spn.end(), ']', ')'); + base_name += ":SPP:Z:" + spn; + if (start_parent_offset >= 0) { + base_name += ":SPO:i:" + to_string(start_parent_offset); + } + base_name += ":SPN:Z:" + string(start_parent_node_reversed ? 
"<" : ">") + to_string(start_parent_node_id); + } + // end parent parent path (swap # for % as hack to not much up parent paths formatting) + // turns out we can't embed []'s either, so swap those too + if (!end_parent_path_name.empty()) { + string spn = end_parent_path_name; + std::replace(spn.begin(), spn.end(), '#', '%'); + std::replace(spn.begin(), spn.end(), '[', '('); + std::replace(spn.begin(), spn.end(), ']', ')'); + base_name += ":EPP:Z:" + spn; + if (end_parent_offset >= 0) { + base_name += ":EPO:i:" + to_string(end_parent_offset); + } + base_name += ":EPN:Z:" + string(end_parent_node_reversed ? "<" : ">") + to_string(end_parent_node_id); + } + + // and return the final path, with sample/locus/rgfa-rank embedded in locus + // (as it's a reference path, we alsos strip out the phase block) + return PathMetadata::create_path_name(PathSense::REFERENCE, rgfa_sample, base_name, path_haplotype, + PathMetadata::NO_PHASE_BLOCK, rgfa_subrange); +} + +string parse_rgfa_path_name(const string& path_name, int* rgfa_rank, + string* rgfa_sample, + string* start_parent_path_name, + int64_t* start_parent_offset, + int64_t* start_parent_node_id, + bool* start_parent_node_reversed, + string* end_parent_path_name, + int64_t* end_parent_offset, + int64_t* end_parent_node_id, + bool* end_parent_node_reversed) { + + + // begin by parsing the # structure of the path + // note that all the rgfa stuff we're pulling out here will be embedded in the "locus" + PathSense path_sense; + string path_sample; + string path_locus; + size_t path_haplotype; + size_t path_phase_block; + subrange_t path_subrange; + PathMetadata::parse_path_name(path_name, path_sense, path_sample, path_locus, + path_haplotype, path_phase_block, path_subrange); + + if (rgfa_sample) { + *rgfa_sample = path_sample; + } + + string sample_name; + string contig_name; + + // now we parse the locus which is just a list of tags sepearted by : + // todo (again), this is too wordy and should be compacted + vector toks = 
split_delims(path_locus, ":"); + assert(toks.size() % 3 == 0); + for (int64_t i = 0; i < toks.size(); i+=3) { + if (toks[i] == "SN") { + assert(toks[i+1] == "Z"); + sample_name = toks[i+2]; + } else if (toks[i] == "SC") { + assert(toks[i+1] == "Z"); + contig_name = toks[i+2]; + } else if (toks[i] == "SR") { + assert(toks[i+1] == "i"); + if (rgfa_rank) { + *rgfa_rank = parse(toks[i+2]); + } + } else if (toks[i] == "SPP") { + assert(toks[i+1] == "Z"); + if (start_parent_path_name) { + *start_parent_path_name = toks[i+2]; + std::replace(start_parent_path_name->begin(), start_parent_path_name->end(), '%', '#'); + std::replace(start_parent_path_name->begin(), start_parent_path_name->end(), '(', '['); + std::replace(start_parent_path_name->begin(), start_parent_path_name->end(), ')', ']'); + } + } else if (toks[i] == "SPO") { + assert(toks[i+1] == "i"); + if (start_parent_offset) { + *start_parent_offset = parse(toks[i+2]); + } + } else if (toks[i] == "SPN") { + assert(toks[i+1] == "Z"); + if (start_parent_node_id) { + *start_parent_node_id = parse(toks[i+2].substr(1)); + } + if (start_parent_node_reversed) { + *start_parent_node_reversed = toks[i+2][0] == '<'; + } + } else if (toks[i] == "EPP") { + assert(toks[i+1] == "Z"); + if (end_parent_path_name) { + *end_parent_path_name = toks[i+2]; + std::replace(end_parent_path_name->begin(), end_parent_path_name->end(), '%', '#'); + std::replace(end_parent_path_name->begin(), end_parent_path_name->end(), '(', '['); + std::replace(end_parent_path_name->begin(), end_parent_path_name->end(), ')', ']'); + } + } else if (toks[i] == "EPO") { + assert(toks[i+1] == "i"); + if (end_parent_offset) { + *end_parent_offset = parse(toks[i+2]); + } + } else if (toks[i] == "EPN") { + assert(toks[i+1] == "Z"); + if (end_parent_node_id) { + *end_parent_node_id = parse(toks[i+2].substr(1)); + } + if (end_parent_node_reversed) { + *end_parent_node_reversed = toks[i+2][0] == '<'; + } + } else { + assert(false); + } + } + + if (path_sense == 
PathSense::REFERENCE && sample_name.empty()) { + // the embedded path never actually had a sample, so we need to flip back to generic + path_sense = PathSense::GENERIC; + path_haplotype = PathMetadata::NO_HAPLOTYPE; + path_phase_block = PathMetadata::NO_PHASE_BLOCK; + } + + // reconstruct the vg path with rgfa stuf stripped from locus + return PathMetadata::create_path_name(path_sense, + sample_name, + contig_name, + path_haplotype, + path_phase_block, + path_subrange); + +} + +string parse_rgfa_name_into_tags(const string& path_name, string& rgfa_tags) { + int rgfa_rank; + string rgfa_sample; + string start_parent_rgfa_path_name; + int64_t start_parent_offset; + int64_t start_parent_node_id; + bool start_parent_node_reversed; + string end_parent_rgfa_path_name; + int64_t end_parent_offset; + int64_t end_parent_node_id; + bool end_parent_node_reversed; + string vg_path_name = parse_rgfa_path_name(path_name, &rgfa_rank, + &rgfa_sample, + &start_parent_rgfa_path_name, + &start_parent_offset, + &start_parent_node_id, + &start_parent_node_reversed, + &end_parent_rgfa_path_name, + &end_parent_offset, + &end_parent_node_id, + &end_parent_node_reversed); + rgfa_tags += "SR:i:" + to_string(rgfa_rank); + if (!start_parent_rgfa_path_name.empty()) { + rgfa_tags += "\tSPP:Z:" + start_parent_rgfa_path_name + + "\tSPO:i:" + to_string(start_parent_offset) + + "\tSPN:Z:" + (start_parent_node_reversed ? "<" : ">") + to_string(start_parent_node_id); + } + if (!end_parent_rgfa_path_name.empty()) { + rgfa_tags += "\tEPP:Z:" + end_parent_rgfa_path_name + + "\tEPO:i:" + to_string(end_parent_offset) + + "\tEPN:Z:" + (end_parent_node_reversed ? 
"<" : ">") + to_string(end_parent_node_id); + } + + return vg_path_name; +} + + + +string strip_rgfa_path_name(const string& path_name) { + + PathSense path_sense; + string path_sample; + string path_locus; + size_t path_haplotype; + size_t path_phase_block; + subrange_t path_subrange; + PathMetadata::parse_path_name(path_name, path_sense, path_sample, path_locus, + path_haplotype, path_phase_block, path_subrange); + + assert(path_locus != PathMetadata::NO_LOCUS_NAME); + + size_t sr_pos = path_locus.rfind(":SR:i:"); + if (sr_pos != string::npos && path_locus.length() - sr_pos >= 6) { + size_t sn_pos = path_locus.rfind("SN:Z:", sr_pos - 1); + assert(sn_pos != string::npos); + + string orig_sample; + if (sn_pos > 0) { + orig_sample = path_locus.substr(0, sn_pos - 1); + } + string orig_locus = path_locus.substr(sn_pos + 5, sr_pos - sn_pos - 5); + + // todo: recover path sense / haploblock? + if (orig_sample.empty()) { + path_sense = PathSense::GENERIC; + path_haplotype = PathMetadata::NO_HAPLOTYPE; + } + return PathMetadata::create_path_name(path_sense, orig_sample, orig_locus, + path_haplotype, path_phase_block, path_subrange); + } + return path_name; +} + +void clamp_path_subrange(string& path_name, int64_t start, int64_t end) { + PathSense path_sense; + string path_sample; + string path_locus; + size_t path_haplotype; + size_t path_phase_block; + subrange_t path_subrange; + PathMetadata::parse_path_name(path_name, path_sense, path_sample, path_locus, + path_haplotype, path_phase_block, path_subrange); + + if (path_subrange == PathMetadata::NO_SUBRANGE) { + path_subrange.first = start; + path_subrange.second = end; + } else { + assert(end <= path_subrange.second - path_subrange.first && end > start); + path_subrange.first += start; + path_subrange.second = path_subrange.first + (end - start); + } + path_name = PathMetadata::create_path_name(path_sense, path_sample, path_locus, + path_haplotype, path_phase_block, path_subrange); +} + +void 
rgfa_graph_cover(MutablePathMutableHandleGraph* graph, + SnarlManager* snarl_manager, + const unordered_set& reference_paths, + int64_t minimum_length, + const string& rgfa_sample_name, + const unordered_map>>& preferred_intervals){ + + // for sanity's sake, we don't want to ever support multiple rgfa covers, so start by + // deleting all existing rgfa fragments (except for rank 0 reference paths, of course) + vector existing_cover; + graph->for_each_path_handle([&](path_handle_t path_handle) { + if (get_rgfa_rank(graph->get_path_name(path_handle)) > 0) { + existing_cover.push_back(path_handle); + assert(!reference_paths.count(path_handle)); + } + }); + for (path_handle_t path_handle : existing_cover) { + graph->destroy_path(path_handle); + } + + // we use the path traversal finder for everything + // (even gbwt haplotypes, as we're using the path handle interface) + PathTraversalFinder path_trav_finder(*graph, *snarl_manager); + + // we collect the rgfa cover in parallel as a list of path fragments + size_t thread_count = get_thread_count(); + vector> thread_covers(thread_count); + + // we process top-level snarls in parallel + snarl_manager->for_each_top_level_snarl_parallel([&](const Snarl* snarl) { + // per-thread output + // each fragment is a rank and vector of steps, the cover is a list of fragments + // TODO: we can store just a first step and count instead of every fragment + // The last two numbers are the indexes of the start and end parent fragments (ie values in cover_not_to_fragment) + vector& cover_fragments = thread_covers.at(omp_get_thread_num()); + // we also index the fragments by their node ids for fast lookups of what's covered by what + // the value here is an index in the above vector + unordered_map cover_node_to_fragment; + + vector queue = {snarl}; + + while(!queue.empty()) { + const Snarl* cur_snarl = queue.back(); + queue.pop_back(); + + // get the snarl cover, writing to cover_nodes and cover_fragments + // note that we are 
single-threaded per top-level snarl, at least for now
+            // this is because parent snarls and child snarls can potentially cover the
+            // same things
+            rgfa_snarl_cover(graph,
+                             *cur_snarl,
+                             path_trav_finder,
+                             reference_paths,
+                             minimum_length,
+                             cover_fragments,
+                             cover_node_to_fragment,
+                             preferred_intervals);
+
+            // recurse on the children
+            const vector<const Snarl*>& children = snarl_manager->children_of(cur_snarl);
+            for (const Snarl* child_snarl : children) {
+                queue.push_back(child_snarl);
+            }
+        }
+    });
+
+    // merge up the thread covers
+    vector<RGFAFragment> rgfa_fragments = std::move(thread_covers.at(0));
+    for (size_t t = 1; t < thread_count; ++t) {
+        rgfa_fragments.reserve(rgfa_fragments.size() + thread_covers.at(t).size());
+        // adjust the offsets into each vector, but leave -1 alone:
+        // it is the "no parent" sentinel, not an index to be shifted
+        for (auto& other_frag : thread_covers.at(t)) {
+            if (other_frag.start_parent_idx >= 0) {
+                other_frag.start_parent_idx += rgfa_fragments.size();
+            }
+            if (other_frag.end_parent_idx >= 0) {
+                other_frag.end_parent_idx += rgfa_fragments.size();
+            }
+        }
+        std::move(thread_covers.at(t).begin(), thread_covers.at(t).end(), std::back_inserter(rgfa_fragments));
+    }
+    thread_covers.clear();
+
+
+    // we don't have a path position interface, and even if we did we probably wouldn't have it on every path
+    // so to keep running time linear, we need to index the fragments so their offsets can be computed in one scan
+    // begin by sorting by path
+    unordered_map<path_handle_t, vector<int64_t>> path_to_fragments;
+    for (size_t i = 0; i < rgfa_fragments.size(); ++i) {
+        const RGFAFragment& rgfa_fragment = rgfa_fragments.at(i);
+        path_handle_t path_handle = graph->get_path_handle_of_step(rgfa_fragment.steps.front());
+        path_to_fragments[path_handle].push_back(i);
+    }
+
+    unordered_map<step_handle_t, int64_t> step_to_pos;
+
+    for (const auto& path_fragments : path_to_fragments) {
+        const path_handle_t& path_handle = path_fragments.first;
+        string path_name = graph->get_path_name(path_handle);
+
+        const vector<int64_t>& fragments = get<1>(path_fragments);
+
+        // for each path, start by finding the positional offset of all relevant steps in the path by brute-force scan
+        int64_t set_count = 0;
+        for (const int64_t& frag_idx : fragments) {
+            const vector<step_handle_t>& rgfa_fragment = rgfa_fragments.at(frag_idx).steps;
+            for
(const step_handle_t& step: rgfa_fragment) { + step_to_pos[step] = -1; + assert(graph->get_path_handle_of_step(step) == path_handle); + ++set_count; + } + } + size_t pos = 0; + graph->for_each_step_in_path(path_handle, [&](const step_handle_t& step_handle) { + if (step_to_pos.count(step_handle)) { + step_to_pos[step_handle] = pos; + --set_count; + } + handle_t handle = graph->get_handle_of_step(step_handle); + pos += graph->get_length(handle); + return true; + }); + //assert(set_count == 0); + } + + for (const auto& path_fragments : path_to_fragments) { + const path_handle_t& path_handle = path_fragments.first; + string path_name = graph->get_path_name(path_handle); + + const vector& fragments = get<1>(path_fragments); + + // second pass to make the path fragments, now that we know the positional offsets of their endpoints + for (const int64_t frag_idx : fragments) { + const RGFAFragment& rgfa_fragment = rgfa_fragments.at(frag_idx); + // don't do anything for rank-0, which we only kept to help with metadata for other fragments + // todo: can we upstream this check? + if (rgfa_fragment.rank == 0) { + continue; + } + + size_t rgfa_frag_pos = step_to_pos[rgfa_fragment.steps.front()]; + size_t rgfa_frag_length = 0; + for (const step_handle_t& step : rgfa_fragment.steps) { + rgfa_frag_length += graph->get_length(graph->get_handle_of_step(step)); + } + subrange_t rgfa_frag_subrange = graph->get_subrange(path_handle); + rgfa_frag_subrange.first = rgfa_frag_pos + (rgfa_frag_subrange != PathMetadata::NO_SUBRANGE ? 
rgfa_frag_subrange.first : 0); + rgfa_frag_subrange.second = rgfa_frag_subrange.first + rgfa_frag_length; + + + string start_parent_rgfa_path_name = ""; + int64_t start_parent_offset = -1; + int64_t start_parent_node_id = -1; + bool start_parent_node_reversed = false; + + // get information about the start parent + if (rgfa_fragment.start_parent_idx >= 0) { + const RGFAFragment& start_parent_fragment = rgfa_fragments.at(rgfa_fragment.start_parent_idx); + path_handle_t start_parent_path_handle = graph->get_path_handle_of_step(start_parent_fragment.steps.front()); + start_parent_rgfa_path_name = graph->get_path_name(start_parent_path_handle); + start_parent_offset = step_to_pos.at(rgfa_fragment.start_parent_step); + subrange_t start_parent_subrange = PathMetadata::parse_subrange(start_parent_rgfa_path_name); + if (start_parent_subrange != PathMetadata::NO_SUBRANGE) { + // now we subset the path to just the fragment interval (which I think it more useful) + int64_t start_parent_range_start = step_to_pos.at(start_parent_fragment.steps.front()); + int64_t start_parent_range_end = start_parent_range_start + start_parent_fragment.length; + clamp_path_subrange(start_parent_rgfa_path_name, start_parent_range_start, start_parent_range_end); + start_parent_offset -= start_parent_range_start; + } + start_parent_node_id = graph->get_id(graph->get_handle_of_step(start_parent_fragment.start_parent_step)); + start_parent_node_reversed = graph->get_is_reverse(graph->get_handle_of_step(start_parent_fragment.start_parent_step)); + } + + string end_parent_rgfa_path_name = ""; + int64_t end_parent_offset = -1; + int64_t end_parent_node_id = -1; + bool end_parent_node_reversed = false; + + // get information about the end parent + if (rgfa_fragment.end_parent_idx >= 0) { + const RGFAFragment& end_parent_fragment = rgfa_fragments.at(rgfa_fragment.end_parent_idx); + path_handle_t end_parent_path_handle = graph->get_path_handle_of_step(end_parent_fragment.steps.front()); + 
end_parent_rgfa_path_name = graph->get_path_name(end_parent_path_handle); + assert(graph->get_path_handle_of_step(rgfa_fragment.end_parent_step) == end_parent_path_handle); + end_parent_offset = step_to_pos.at(rgfa_fragment.end_parent_step); + subrange_t end_parent_subrange = PathMetadata::parse_subrange(end_parent_rgfa_path_name); + if (end_parent_subrange != PathMetadata::NO_SUBRANGE) { + // now we subset the path to just the fragment interval (which I think it more useful) + int64_t end_parent_range_start = step_to_pos.at(end_parent_fragment.steps.front()); + int64_t end_parent_range_end = end_parent_range_start + end_parent_fragment.length; + clamp_path_subrange(end_parent_rgfa_path_name, end_parent_range_start, end_parent_range_end); + end_parent_offset -= end_parent_range_start; + } + end_parent_node_id = graph->get_id(graph->get_handle_of_step(end_parent_fragment.end_parent_step)); + end_parent_node_reversed = graph->get_is_reverse(graph->get_handle_of_step(end_parent_fragment.end_parent_step)); + } + + string rgfa_frag_name = create_rgfa_path_name(path_name, rgfa_fragment.rank, rgfa_frag_subrange, rgfa_sample_name, + start_parent_rgfa_path_name, + start_parent_offset, + start_parent_node_id, start_parent_node_reversed, + end_parent_rgfa_path_name, + end_parent_offset, + end_parent_node_id, end_parent_node_reversed); + +#ifdef debug +#pragma omp critical(cerr) + cerr << "making new rgfa fragment with name " << rgfa_frag_name << " and " << rgfa_fragment.steps.size() << " steps. 
subrange " + << rgfa_frag_subrange.first << "," << rgfa_frag_subrange.second << endl; +#endif + path_handle_t rgfa_fragment_handle = graph->create_path_handle(rgfa_frag_name); + for (const step_handle_t& step : rgfa_fragment.steps) { + graph->append_step(rgfa_fragment_handle, graph->get_handle_of_step(step)); + } + } + } + + // forwardize the graph + rgfa_forwardize_paths(graph, reference_paths); +} + +void rgfa_snarl_cover(const PathHandleGraph* graph, + const Snarl& snarl, + PathTraversalFinder& path_trav_finder, + const unordered_set& reference_paths, + int64_t minimum_length, + vector& cover_fragments, + unordered_map& cover_node_to_fragment, + const unordered_map>>& preferred_intervals) { + +#ifdef debug +#pragma omp critical(cerr) + cerr << "calling rgfa_snarl_cover on " << pb2json(snarl) << endl; +#endif + + // // start by finding the path traversals through the snarl + vector> travs; + { + pair, vector > > path_travs = path_trav_finder.find_path_traversals(snarl); + travs.reserve(path_travs.first.size()); + + // reduce protobuf usage by going back to vector of steps instead of keeping SnarlTraversals around + for (int64_t i = 0; i < path_travs.first.size(); ++i) { + string trav_path_name = graph->get_path_name(graph->get_path_handle_of_step(path_travs.second[i].first)); + if (get_rgfa_rank(trav_path_name) > 0) { + // we ignore existing (off-reference) rGFA paths + // todo: shoulgd there be better error handling? 
+ cerr << "Warning : skipping existing rgfa traversal " << trav_path_name << endl; + continue; + } + bool reversed = false; + if (graph->get_is_reverse(graph->get_handle_of_step(path_travs.second[i].first)) != snarl.start().backward()) { + reversed = true; + } + assert((graph->get_is_reverse(graph->get_handle_of_step(path_travs.second[i].second)) != snarl.end().backward()) == reversed); + vector trav; + trav.reserve(path_travs.first[i].visit_size()); + bool done = false; + function visit_next_step = [&graph,&reversed](step_handle_t step_handle) { + return reversed ? graph->get_previous_step(step_handle) : graph->get_next_step(step_handle); + }; + for (step_handle_t step_handle = path_travs.second[i].first; !done; step_handle = visit_next_step(step_handle)) { + trav.push_back(step_handle); + if (step_handle == path_travs.second[i].second) { + done = true; + } + } + if (reversed) { + std::reverse(trav.begin(), trav.end()); + } + travs.push_back(trav); + } + } + + // find all reference paths through the snarl + map ref_paths; + for (int64_t i = 0; i < travs.size(); ++i) { + path_handle_t trav_path = graph->get_path_handle_of_step(travs[i].front()); + if (reference_paths.count(trav_path)) { + ref_paths[graph->get_path_name(trav_path)] = i; + } + } + + // note: checking both snarl endpoint here is actually necessary: if the reference path doesn't end in a tip, + // you can end up with a trivial snarl at its end which will crash on this test. + if (ref_paths.empty() && (!cover_node_to_fragment.count(snarl.start().node_id()) || !cover_node_to_fragment.count(snarl.end().node_id()))) { + // we're not nested in a reference snarl, and we have no reference path + // by the current logic, there's nothing to be done. + cerr << "[rgfa] warning: No referene path through snarl " + << pb2json(snarl) << ": unable to process for rGFA cover" << endl; + return; + } + + if (ref_paths.size() > 1) { + // And we could probably cope with this... 
but don't for now + cerr << "[rgfa] error: Multiple reference path traversals found through snarl " + << pb2json(snarl) << endl; + } + + if (!ref_paths.empty()) { + // update the cover: note that we won't actually make a path out of this + // since we leave rank-0 paths the way they are, but it's handy to have + // a consistent representation for the cover while linking up the + // various nesting metadata. + int64_t ref_trav_length = 0; + vector& ref_trav = travs.at(ref_paths.begin()->second); + for (step_handle_t ref_step_handle : ref_trav) { + nid_t node_id = graph->get_id(graph->get_handle_of_step(ref_step_handle)); + cover_node_to_fragment[node_id] = cover_fragments.size(); + ref_trav_length += graph->get_length(graph->get_handle_of_step(ref_step_handle)); + } + + // todo: check endpoints (and remove from ref_trav??) + RGFAFragment frag = { + 0, ref_trav, ref_trav_length, -1, -1, ref_trav.front(), ref_trav.back() + }; + cover_fragments.push_back(frag); + } + +#ifdef debug +#pragma omp critical(cerr) + cerr << "found " << travs.size() << " traversals including " << ref_paths.size() << " reference traversals" << endl; +#endif + + + // find all intervals within a snarl traversal that are completely uncovered. + // the returned intervals are open-ended. + function>(const vector&)> get_uncovered_intervals = [&](const vector& trav) { + vector> intervals; + int64_t start = -1; + unordered_set dupe_check; + for (size_t i = 0; i < trav.size(); ++i) { + nid_t node_id = graph->get_id(graph->get_handle_of_step(trav[i])); + bool covered = cover_node_to_fragment.count(node_id); + // we break at dupes even if uncovered -- never want same id twice in an interval + bool dupe = !covered && dupe_check.count(node_id); + dupe_check.insert(node_id); + if (covered || dupe) { + if (start != -1) { + intervals.push_back(make_pair(start, i)); + } + start = dupe ?
i : -1; + } else { + if (start == -1) { + start = i; + } + } + } + if (start != -1) { + intervals.push_back(make_pair(start, trav.size())); + } + return intervals; + }; + + // build an initial ranked list of candidate traversal fragments + vector, pair>>> ranked_trav_fragments; + for (int64_t trav_idx = 0; trav_idx < travs.size(); ++trav_idx) { + // todo: this map seems backwards? note really a big deal since + // we're only allowing one element + bool is_ref = false; + for (const auto& ref_trav : ref_paths) { + if (ref_trav.second == trav_idx) { + is_ref = true; + break; + } + } + if (is_ref) { + continue; + } + const vector& trav = travs.at(trav_idx); + vector> uncovered_intervals = get_uncovered_intervals(trav); + + for (const auto& uncovered_interval : uncovered_intervals) { + unordered_set cycle_check; + bool cyclic = false; + int64_t interval_length = 0; + for (int64_t i = uncovered_interval.first; i < uncovered_interval.second && !cyclic; ++i) { + handle_t handle = graph->get_handle_of_step(trav[i]); + interval_length += graph->get_length(handle); + nid_t node_id = graph->get_id(handle); + if (cycle_check.count(node_id)) { + cyclic = true; + } else { + cycle_check.insert(node_id); + } + } + if (!cyclic && interval_length >= minimum_length) { + auto trav_stats = rgfa_traversal_stats(graph, trav, uncovered_interval); + ranked_trav_fragments.push_back(make_pair(trav_stats, make_pair(trav_idx, uncovered_interval))); + } + } + } + + // todo: typedef! 
+ function, pair>>& s1, + const pair, pair>>& s2)> heap_comp = + [](const pair, pair>>& s1, + const pair, pair>>& s2) { + return rgfa_traversal_stats_less(s1.first, s2.first); + }; + + // put the fragments into a max heap + std::make_heap(ranked_trav_fragments.begin(), ranked_trav_fragments.end(), heap_comp); + + // now greedily pull out traversal intervals from the ranked list until none are left + while (!ranked_trav_fragments.empty()) { + + // get the best scoring (max) fragment from heap + auto best_stats_fragment = ranked_trav_fragments.front(); + std::pop_heap(ranked_trav_fragments.begin(), ranked_trav_fragments.end(), heap_comp); + ranked_trav_fragments.pop_back(); + + const vector& trav = travs.at(best_stats_fragment.second.first); + const pair& uncovered_interval = best_stats_fragment.second.second; + + // our traversal may have been partially covered by a different iteration, if so, we need to break it up + // and continue + vector> chopped_intervals; + int64_t cur_start = -1; + bool chopped = false; + for (int64_t i = uncovered_interval.first; i < uncovered_interval.second; ++i) { + bool covered = cover_node_to_fragment.count(graph->get_id(graph->get_handle_of_step(trav[i]))); + if (!covered && cur_start == -1) { + cur_start = i; + } else if (covered) { + chopped = true; + if (cur_start != -1) { + chopped_intervals.push_back(make_pair(cur_start, i)); + cur_start = -1; + } + } + } + if (cur_start != -1) { + chopped_intervals.push_back(make_pair(cur_start, uncovered_interval.second)); + } + if (chopped) { + for (const pair& chopped_interval : chopped_intervals) { + int64_t chopped_trav_length = 0; + for (int64_t i = chopped_interval.first; i < chopped_interval.second; ++i) { + chopped_trav_length += graph->get_length(graph->get_handle_of_step(trav[i])); + } + if (chopped_trav_length >= minimum_length) { + auto chopped_stats = rgfa_traversal_stats(graph, trav, chopped_interval); + ranked_trav_fragments.push_back(make_pair(chopped_stats, 
make_pair(best_stats_fragment.second.first, chopped_interval))); + std::push_heap(ranked_trav_fragments.begin(), ranked_trav_fragments.end(), heap_comp); + } + } + continue; + } + + // we check the "backbone" interval that this interval is coming off + // since we've already covered the reference, then we know that this interval + // doesn't span the whole snarl including endpoints, so we can always afford + // to look back and ahead one + assert(uncovered_interval.first > 0 && uncovered_interval.second < trav.size()); + int64_t prev_frag_idx = cover_node_to_fragment.at(graph->get_id(graph->get_handle_of_step(trav[uncovered_interval.first - 1]))); + int64_t next_frag_idx = cover_node_to_fragment.at(graph->get_id(graph->get_handle_of_step(trav[uncovered_interval.second]))); + // todo: i'm not sure if/how minigraph treats these cases, where the anchors connect to different ranks + // also, can these be avoided entirely? + int64_t min_frag_idx = std::min(prev_frag_idx, next_frag_idx); + int64_t fragment_rank; + if (min_frag_idx == -1) { + fragment_rank = 1; + } else { + fragment_rank = cover_fragments.at(min_frag_idx).rank + 1; + } + + // now we need to find the steps on the parent path, in order to link back to its position + // todo: can we avoid this enumeration by keeping track of the parent directly from the beginning? 
+ handle_t start_handle = graph->get_handle_of_step(trav[uncovered_interval.first - 1]); + vector start_handle_steps = graph->steps_of_handle(start_handle); + path_handle_t start_parent_path = graph->get_path_handle_of_step(cover_fragments.at(prev_frag_idx).steps.front()); + vector start_parent_step_indexes; + for (int64_t i = 0; i < start_handle_steps.size(); ++i) { + if (graph->get_path_handle_of_step(start_handle_steps[i]) == start_parent_path) { + start_parent_step_indexes.push_back(i); + } + } + if (start_parent_step_indexes.size() > 1) { + // fall back to brute force scan of the entire parent path + start_parent_step_indexes.clear(); + start_handle_steps.clear(); + for (const auto& parent_step : cover_fragments.at(prev_frag_idx).steps) { + if (graph->get_id(graph->get_handle_of_step(parent_step)) == graph->get_id(start_handle)) { + start_parent_step_indexes.push_back(start_handle_steps.size()); + start_handle_steps.push_back(parent_step); + // todo: break (but leave out for now to do below assertion) + } + } + } + assert(start_parent_step_indexes.size() == 1); + step_handle_t start_parent_step = start_handle_steps[start_parent_step_indexes[0]]; + +#ifdef debug +#pragma omp critical(cerr) + cerr << "adding step " << graph->get_path_name(start_parent_path) << " " << graph->get_id(graph->get_handle_of_step(start_parent_step)) << " as start parent for " << graph->get_path_name(graph->get_path_handle_of_step(trav.front())) << " starting at " << graph->get_id(graph->get_handle_of_step(trav.at(uncovered_interval.first))) << endl; +#endif + + handle_t end_handle = graph->get_handle_of_step(trav[uncovered_interval.second]); + vector end_handle_steps = graph->steps_of_handle(end_handle); + path_handle_t end_parent_path = graph->get_path_handle_of_step(cover_fragments.at(next_frag_idx).steps.front()); + vector end_parent_step_indexes; + for (int64_t i = 0; i < end_handle_steps.size(); ++i) { + if (graph->get_path_handle_of_step(end_handle_steps[i]) == end_parent_path) { 
+ end_parent_step_indexes.push_back(i); + } + } + if (end_parent_step_indexes.size() > 1) { + // fall back to brute force scan of the entire parent path + end_parent_step_indexes.clear(); + end_handle_steps.clear(); + for (const auto& parent_step : cover_fragments.at(next_frag_idx).steps) { + if (graph->get_id(graph->get_handle_of_step(parent_step)) == graph->get_id(end_handle)) { + end_parent_step_indexes.push_back(end_handle_steps.size()); + end_handle_steps.push_back(parent_step); + // todo: break (but leave out for now to do below assertion) + } + } + } + assert(end_parent_step_indexes.size() == 1); + step_handle_t end_parent_step = end_handle_steps[end_parent_step_indexes[0]]; + + // update the cover + vector interval; + int64_t interval_length = 0; + interval.reserve(uncovered_interval.second - uncovered_interval.first); + for (int64_t i = uncovered_interval.first; i < uncovered_interval.second; ++i) { + interval.push_back(trav[i]); + interval_length += graph->get_length(graph->get_handle_of_step(trav[i])); + cover_node_to_fragment[graph->get_id(graph->get_handle_of_step(trav[i]))] = cover_fragments.size(); + } + RGFAFragment frag = { + fragment_rank, std::move(interval), interval_length, + prev_frag_idx, next_frag_idx, + start_parent_step, end_parent_step + }; + +#ifdef debug +#pragma omp critical(cerr) +{ + cerr << "adding fragment path=" << graph->get_path_name(graph->get_path_handle_of_step(frag.steps.front())) << " rank=" << fragment_rank << " steps = "; + for (auto xx : frag.steps) { + cerr << graph->get_id(graph->get_handle_of_step(xx)) <<":" << graph->get_is_reverse(graph->get_handle_of_step(xx)) <<","; + } + cerr << " sps " << graph->get_id(graph->get_handle_of_step(start_parent_step)) <<":" << graph->get_is_reverse(graph->get_handle_of_step(start_parent_step)); + cerr << " eps " << graph->get_id(graph->get_handle_of_step(end_parent_step)) <<":" << graph->get_is_reverse(graph->get_handle_of_step(end_parent_step)); + cerr << endl; +} +#endif + 
cover_fragments.push_back(frag); + } +} + +tuple rgfa_traversal_stats(const PathHandleGraph* graph, + const vector& trav, + const pair& trav_fragment) { + path_handle_t path_handle = graph->get_path_handle_of_step(trav.front()); + int64_t support = 0; + int64_t reversed_steps = 0; + int64_t dupes = 0; + + for (int64_t i = trav_fragment.first; i < trav_fragment.second; ++i) { + const step_handle_t& step = trav[i]; + handle_t handle = graph->get_handle_of_step(step); + vector all_steps = graph->steps_of_handle(handle); + int64_t length = graph->get_length(handle); + support += length; + int64_t self_count = 0; + for (const step_handle_t& other_step : all_steps) { + path_handle_t step_path_handle = graph->get_path_handle_of_step(other_step); + if (step_path_handle == path_handle) { + ++self_count; + } else { + support += length; + } + } + if (self_count > 1) { + dupes += length * (self_count - 1); + } + if (i > 0 && graph->get_is_reverse(handle)) { + ++reversed_steps; + } + } + + return std::make_tuple(support, reversed_steps, dupes); +} + +bool rgfa_traversal_stats_less(const tuple& s1, const tuple& s2) { + // duplicates are a deal breaker, if one traversal has no duplicates and the other does, the former wins + if (get<2>(s1) > 0 && get<2>(s2) == 0) { + return true; + } else if (get<2>(s1) == 0 && get<2>(s2) > 0) { + return false; + } + + // then support + if (get<0>(s1) < get<0>(s2)) { + return true; + } else if (get<0>(s1) > get<0>(s2)) { + return false; + } + + // then duplicates (by value) + if (get<2>(s1) > get<2>(s2)) { + return true; + } else if (get<2>(s1) < get<2>(s2)) { + return false; + } + + // then switches + return get<1>(s1) > get<1>(s2); +} + +// copied pretty much verbatem from +// https://github.com/ComparativeGenomicsToolkit/hal2vg/blob/v1.1.2/clip-vg.cpp#L809-L880 +void rgfa_forwardize_paths(MutablePathMutableHandleGraph* graph, + const unordered_set& reference_paths) { + + unordered_map id_map; + graph->for_each_path_handle([&](path_handle_t 
path_handle) { + string path_name = graph->get_path_name(path_handle); + if (reference_paths.count(path_handle) || get_rgfa_rank(path_name) >= 0) { + size_t fw_count = 0; + size_t total_steps = 0; + graph->for_each_step_in_path(path_handle, [&](step_handle_t step_handle) { + handle_t handle = graph->get_handle_of_step(step_handle); + if (graph->get_is_reverse(handle)) { + handle_t flipped_handle = graph->create_handle(graph->get_sequence(handle)); + id_map[graph->get_id(flipped_handle)] = graph->get_id(handle); + graph->follow_edges(handle, true, [&](handle_t prev_handle) { + if (graph->get_id(prev_handle) != graph->get_id(handle)) { + graph->create_edge(prev_handle, flipped_handle); + } + }); + graph->follow_edges(handle, false, [&](handle_t next_handle) { + if (graph->get_id(handle) != graph->get_id(next_handle)) { + graph->create_edge(flipped_handle, next_handle); + } + }); + // self-loop cases we punted on above: + if (graph->has_edge(handle, handle)) { + graph->create_edge(flipped_handle, flipped_handle); + } + if (graph->has_edge(handle, graph->flip(handle))) { + graph->create_edge(flipped_handle, graph->flip(flipped_handle)); + } + if (graph->has_edge(graph->flip(handle), handle)) { + graph->create_edge(graph->flip(flipped_handle), flipped_handle); + } + vector steps = graph->steps_of_handle(handle); + size_t ref_count = 0; + for (step_handle_t step : steps) { + if (graph->get_path_handle_of_step(step) == path_handle) { + ++ref_count; + } + step_handle_t next_step = graph->get_next_step(step); + handle_t new_handle = graph->get_is_reverse(graph->get_handle_of_step(step)) ? 
flipped_handle : + graph->flip(flipped_handle); + graph->rewrite_segment(step, next_step, {new_handle}); + } + if (ref_count > 1) { + cerr << "[rGFA] error: Cycle detected in rGFA path " << path_name << " at node " << graph->get_id(handle) << endl; + exit(1); + } + ++fw_count; + assert(graph->steps_of_handle(handle).empty()); + dynamic_cast(graph)->destroy_handle(handle); + } + ++total_steps; + }); + } + }); + + // rename all the ids back to what they were (so nodes keep their ids, just get flipped around) + graph->reassign_node_ids([&id_map](nid_t new_id) { + return id_map.count(new_id) ? id_map[new_id] : new_id; + }); + + // do a check just to be sure + graph->for_each_path_handle([&](path_handle_t path_handle) { + string path_name = graph->get_path_name(path_handle); + if (reference_paths.count(path_handle) || get_rgfa_rank(path_name) >= 0) { + graph->for_each_step_in_path(path_handle, [&](step_handle_t step_handle) { + handle_t handle = graph->get_handle_of_step(step_handle); + if (graph->get_is_reverse(handle)) { + cerr << "[rGFA] error: Failed to fowardize node " << graph->get_id(handle) << " in path " << path_name << endl; + exit(1); + } + }); + } + }); +} + + } diff --git a/src/gfa.hpp b/src/gfa.hpp index 3e914c21499..01cfb41e379 100644 --- a/src/gfa.hpp +++ b/src/gfa.hpp @@ -12,6 +12,8 @@ */ #include "handle.hpp" +#include "snarls.hpp" +#include "traversal_finder.hpp" namespace vg { @@ -26,6 +28,116 @@ void graph_to_gfa(const PathHandleGraph* graph, ostream& out, bool rgfa_pline = false, bool use_w_lines = true); + +/// Prototype code to tag paths as rGFA paths. Either needs to be completely scrapped +/// or adapted into libhandlegraph at some point, ideally. + +/// It works by using a special sample name (default=_rGFA_) for rGFA contigs. 
+/// Any real sample name gets pushed into the locus field behind its rGFA tag SN:Z: +/// The rGFA rank also goes in the locus field behind SR:i: + +/// In GFA, these paths live in rGFA tags on S elements +/// In the graph, they are reference paths with SN/SR fields in their locus names. +/// As it stands, they will come out as W-lines in GFA with vg view or vg convert (without -Q/-P) + +/// Note that rank-0 rGFA fragments (aka normal reference paths) do *not* get the rGFA +/// sample, and are treated as normal reference paths all the way through (but can get rGFA tags) +/// when specified with -Q/-P in convert -f. + +/// Returns the RGFA rank (SR) of a path. This will be 0 for the reference +/// backbone, and higher the further number of (nested) bubbles away it is. +/// If the path is not an RGFA path, then return -1 +int get_rgfa_rank(const string& path_name); + +/// Remove the rGFA information from a path name, effectively undoing set_rgfa_rank +string strip_rgfa_path_name(const string& path_name); + +/// Clamp a path to given subrange (taking into account an existing subrange) +void clamp_path_subrange(string& path_name, int64_t start, int64_t end); + +/// Add the rgfa rank to a pathname, also setting its sample to the special rgfa sample and +/// moving its old sample into the locus field +string create_rgfa_path_name(const string& path_name, int rgfa_rank, const subrange_t& subrange, + const string& rgfa_sample, + const string& start_parent_path_name = "", + int64_t start_parent_offset = -1, + int64_t start_parent_node_id = -1, + bool start_parent_node_reversed = false, + const string& end_parent_path_name = "", + int64_t end_parent_offset = -1, + int64_t end_parent_node_id = -1, + bool end_parent_node_reversed = false); + + +/// Parse (and remove) all rgfa-specific information, copying it into the various input paramters +/// The rgfa-information-free vg path name is returned (ie the path_name input to carete_rgfa_path_name) +string 
parse_rgfa_path_name(const string& path_name, int* rgfa_rank = nullptr, + string* rgfa_sample = nullptr, + string* start_parent_rgfa_path_name = nullptr, + int64_t* start_parent_offset = nullptr, + int64_t* start_parent_node_id = nullptr, + bool* start_parent_node_reversed = nullptr, + string* end_parent_rgfa_path_name = nullptr, + int64_t* end_parent_offset = nullptr, + int64_t* end_parent_node_id = nullptr, + bool* end_parent_node_reversed = nullptr); + +/// Like above, but put the rgfa stuff into tags +string parse_rgfa_name_into_tags(const string& path_name, string& rgfa_tags); + + +/// Compute the rGFA path cover +/// graph: the graph +/// snarl_manager: the snarls (todo: should use distance index) +/// reference_paths: rank-0 paths +/// minimum_length: the minimum length of a path to create (alleles shorter than this can be uncovered) +/// preferred_intervals: set of ranges (ex from minigraph) to use as possible for rGFA paths +void rgfa_graph_cover(MutablePathMutableHandleGraph* graph, + SnarlManager* snarl_manager, + const unordered_set& reference_paths, + int64_t minimum_length, + const string& rgfa_sample_name, + const unordered_map>>& preferred_intervals = {}); + +/// Some information about an RGFA fragment we pass around and index below +struct RGFAFragment { + int64_t rank; + vector steps; + int64_t length; + int64_t start_parent_idx; + int64_t end_parent_idx; + step_handle_t start_parent_step; + step_handle_t end_parent_step; +}; + +void rgfa_snarl_cover(const PathHandleGraph* graph, + const Snarl& snarl, + PathTraversalFinder& path_trav_finder, + const unordered_set& reference_paths, + int64_t minimum_length, + vector& cover_fragments, + unordered_map& cover_node_to_fragment, + const unordered_map>>& preferred_intervals); + +/// Get some statistics from a traversal fragment that we use for ranking in the greedy algorithm +/// 1. Coverage : total step length across the traversal for all paths +/// 2. 
Switches : the number of nodes that would need to be flipped to forwardize the traversal +/// 3. Duplicated bases : the number of duplicated bases in the traversal path +tuple rgfa_traversal_stats(const PathHandleGraph* graph, + const vector& trav, + const pair& trav_fragment); + +/// Comparison of the above stats for the purposes of greedily selecting (highest better) traversals +bool rgfa_traversal_stats_less(const tuple& s1, const tuple& s2); + +/// Make sure all rgfa paths are forwardized +void rgfa_forwardize_paths(MutablePathMutableHandleGraph* graph, + const unordered_set& reference_paths); + +/// Extract rGFA tags from minigraph GFA in order to pass in as hints above +unordered_map>> extract_rgfa_intervals(const string& rgfa_path); + + } #endif diff --git a/src/hts_alignment_emitter.cpp b/src/hts_alignment_emitter.cpp index 2df5a1fac51..63b93b1c903 100644 --- a/src/hts_alignment_emitter.cpp +++ b/src/hts_alignment_emitter.cpp @@ -12,6 +12,7 @@ #include "algorithms/find_translation.hpp" #include #include +#include "gfa.hpp" #include @@ -178,7 +179,10 @@ pair>, unordered_map> extract_path auto& path = get<0>(path_info); auto& own_length = get<1>(path_info); auto& base_length = get<2>(path_info); - string base_path_name = subpath_support ? get_path_base_name(graph, path) : graph.get_path_name(path); + string base_path_name = graph.get_path_name(path); + if (subpath_support && get_rgfa_rank(base_path_name) < 0) { + base_path_name = get_path_base_name(graph, path); + } if (!base_path_set.count(base_path_name)) { path_names_and_lengths.push_back(make_pair(base_path_name, base_length)); base_path_set.insert(base_path_name); @@ -355,9 +359,14 @@ vector> get_sequence_dictionary(const strin // When we find a path or subpath we want, we will keep it. 
auto keep_path_or_subpath = [&](const path_handle_t& path) { - string base_name = get_path_base_name(graph, path); + string base_name = graph.get_path_name(path); + bool is_rgfa = get_rgfa_rank(base_name) >= 0; + if (!is_rgfa) { + // only process subpaths if not rgfa + base_name = get_path_base_name(graph, path); + } int64_t base_length = -1; - if (graph.get_subrange(path) == PathMetadata::NO_SUBRANGE) { + if (is_rgfa || graph.get_subrange(path) == PathMetadata::NO_SUBRANGE) { // This is a full path so we can determine base length now. base_length = graph.get_path_length(path); } @@ -691,11 +700,13 @@ void HTSAlignmentEmitter::convert_alignment(const Alignment& aln, vector0) rgfa paths get written if a base path specified + if (get_rgfa_rank(path_name) > 0) { + assert(!rgfa_paths.count(path_name)); + rgfa_paths.insert(path_name); + } }); + } graph_to_gfa(graph_to_write, std::cout, rgfa_paths, rgfa_pline, wline); } else { @@ -468,7 +474,7 @@ void help_convert(char** argv) { << "gfa output options (use with -f):" << endl << " -P, --rgfa-path STR write given path as rGFA tags instead of lines (multiple allowed, only rank-0 supported)" << endl << " -Q, --rgfa-prefix STR write paths with given prefix as rGFA tags instead of lines (multiple allowed, only rank-0 supported)" << endl - << " -B, --rgfa-pline paths written as rGFA tags also written as lines" << endl + << " -B, --rgfa-pline paths written as rank 0 rGFA tags also written as lines" << endl << " -W, --no-wline write all paths as GFA P-lines instead of W-lines. Allows handling multiple phase blocks and subranges used together." << endl << " --gbwtgraph-algorithm Always use the GBWTGraph library GFA algorithm. Not compatible with other GBWT output options or non-GBWT graphs." << endl << " --vg-algorithm Always use the VG GFA algorithm. Works with all options and graph types, but can't preserve original GFA coordinates." 
<< endl diff --git a/src/subcommand/paths_main.cpp b/src/subcommand/paths_main.cpp index dd8589947a1..0e62b6de87a 100644 --- a/src/subcommand/paths_main.cpp +++ b/src/subcommand/paths_main.cpp @@ -16,6 +16,9 @@ #include "../vg.hpp" #include "../xg.hpp" #include "../gbwt_helper.hpp" +#include "../integrated_snarl_finder.hpp" +#include "../gfa.hpp" +#include "../io/save_handle_graph.hpp" #include #include #include @@ -37,6 +40,11 @@ void help_paths(char** argv) { << " -V, --extract-vg output a path-only graph covering the selected paths" << endl << " -d, --drop-paths output a graph with the selected paths removed" << endl << " -r, --retain-paths output a graph with only the selected paths retained" << endl + << " rGFA cover" << endl + << " -R, --rgfa-min-length N add rGFA cover to graph, using seleciton from -Q/-S as rank-0 backbone, only adding fragments >= Nbp (default:-1=disabled)" << endl + << " -N, --rgfa-sample STR give all rGFA cover fragments sample name (path prefix) STR (default: _rGFA_)." << endl + << " -s, --snarls FILE snarls (from vg snarls) to avoid recomputing. snarls only used for rgfa cover (-R)." 
<< endl + << " -t, --threads N use up to N threads when computing rGFA cover (default: all available)" << endl << " output path data:" << endl << " -X, --extract-gam print (as GAM alignments) the stored paths in the graph" << endl << " -A, --extract-gaf print (as GAF alignments) the stored paths in the graph" << endl @@ -99,6 +107,9 @@ int main_paths(int argc, char** argv) { bool extract_as_fasta = false; bool drop_paths = false; bool retain_paths = false; + int64_t rgfa_min_len = -1; + string rgfa_sample_name = "_rGFA_"; + string snarl_filename; string graph_file; string gbwt_file; string path_prefix; @@ -131,6 +142,9 @@ int main_paths(int argc, char** argv) { {"extract-vg", no_argument, 0, 'V'}, {"drop-paths", no_argument, 0, 'd'}, {"retain-paths", no_argument, 0, 'r'}, + {"rgfa-cover", required_argument, 0, 'R'}, + {"rgfa-sample", required_argument, 0, 'N'}, + {"snarls", required_argument, 0, 's'}, {"extract-gam", no_argument, 0, 'X'}, {"extract-gaf", no_argument, 0, 'A'}, {"list", no_argument, 0, 'L'}, @@ -144,16 +158,15 @@ int main_paths(int argc, char** argv) { {"variant-paths", no_argument, 0, 'a'}, {"generic-paths", no_argument, 0, 'G'}, {"coverage", no_argument, 0, 'c'}, - + {"threads", required_argument, 0, 't'}, // Hidden options for backward compatibility. - {"threads", no_argument, 0, 'T'}, {"threads-by", required_argument, 0, 'q'}, {0, 0, 0, 0} }; int option_index = 0; - c = getopt_long (argc, argv, "hLXv:x:g:Q:VEMCFAS:Tq:draGp:c", + c = getopt_long (argc, argv, "hLXv:x:g:Q:VEMCFAS:Tq:drR:N:s:aGp:ct:", long_options, &option_index); // Detect the end of the options. 
@@ -182,13 +195,26 @@ int main_paths(int argc, char** argv) { case 'd': drop_paths = true; output_formats++; - break; + break; case 'r': retain_paths = true; output_formats++; break; - + + case 'R': + rgfa_min_len = parse(optarg); + output_formats++; + break; + + case 'N': + rgfa_sample_name = optarg; + break; + + case 's': + snarl_filename = optarg; + break; + case 'X': extract_as_gam = true; output_formats++; @@ -260,9 +286,16 @@ int main_paths(int argc, char** argv) { output_formats++; break; - case 'T': - std::cerr << "warning: [vg paths] option --threads is obsolete and unnecessary" << std::endl; + case 't': + { + int num_threads = parse(optarg); + if (num_threads <= 0) { + cerr << "error:[vg call] Thread count (-t) set to " << num_threads << ", must set to a positive integer." << endl; + exit(1); + } + omp_set_num_threads(num_threads); break; + } case 'q': std::cerr << "warning: [vg paths] option --threads-by is deprecated; please use --paths-by" << std::endl; @@ -299,7 +332,7 @@ int main_paths(int argc, char** argv) { } } if (output_formats != 1) { - std::cerr << "error: [vg paths] one output format (-X, -A, -V, -d, -r, -L, -F, -E, -C or -c) must be specified" << std::endl; + std::cerr << "error: [vg paths] one output format (-X, -A, -V, -d, -r, -R, -L, -F, -E, -C or -c) must be specified" << std::endl; std::exit(EXIT_FAILURE); } if (selection_criteria > 1) { @@ -350,9 +383,23 @@ int main_paths(int argc, char** argv) { exit(1); } } - - - + + // Load or compute the snarls + unique_ptr snarl_manager; + if (rgfa_min_len >= 0) { + if (!snarl_filename.empty()) { + ifstream snarl_file(snarl_filename.c_str()); + if (!snarl_file) { + cerr << "Error [vg paths]: Unable to load snarls file: " << snarl_filename << endl; + return 1; + } + snarl_manager = vg::io::VPKG::load_one(snarl_file); + } else { + IntegratedSnarlFinder finder(*graph); + snarl_manager = unique_ptr(new SnarlManager(std::move(finder.find_snarls_parallel()))); + } + } + set path_names; if 
(!path_file.empty()) { ifstream path_stream(path_file); @@ -545,7 +592,7 @@ int main_paths(int argc, char** argv) { }; - if (drop_paths || retain_paths) { + if (drop_paths || retain_paths || rgfa_min_len >= 0) { MutablePathMutableHandleGraph* mutable_graph = dynamic_cast(graph.get()); if (!mutable_graph) { std::cerr << "error[vg paths]: graph cannot be modified" << std::endl; @@ -557,24 +604,36 @@ int main_paths(int argc, char** argv) { exit(1); } - vector to_destroy; - if (drop_paths) { - for_each_selected_path([&](const path_handle_t& path_handle) { - string name = graph->get_path_name(path_handle); - to_destroy.push_back(name); - }); + if (drop_paths || retain_paths) { + vector to_destroy; + if (drop_paths) { + for_each_selected_path([&](const path_handle_t& path_handle) { + string name = graph->get_path_name(path_handle); + to_destroy.push_back(name); + }); + } else { + for_each_unselected_path([&](const path_handle_t& path_handle) { + string name = graph->get_path_name(path_handle); + to_destroy.push_back(name); + }); + } + for (string& path_name : to_destroy) { + mutable_graph->destroy_path(graph->get_path_handle(path_name)); + } } else { - for_each_unselected_path([&](const path_handle_t& path_handle) { - string name = graph->get_path_name(path_handle); - to_destroy.push_back(name); + assert(rgfa_min_len >= 0); + unordered_set reference_paths; + for_each_selected_path([&](const path_handle_t& path_handle) { + if (get_rgfa_rank(graph->get_path_name(path_handle)) <= 0) { + reference_paths.insert(path_handle); + } }); - } - for (string& path_name : to_destroy) { - mutable_graph->destroy_path(graph->get_path_handle(path_name)); + + rgfa_graph_cover(mutable_graph, snarl_manager.get(), reference_paths, rgfa_min_len, rgfa_sample_name); } // output the graph - serializable_graph->serialize(cout); + vg::io::save_handle_graph(graph.get(), std::cout); } else if (coverage) { // for every node, count the number of unique paths. 
then add the coverage count to each one diff --git a/test/rgfa/rgfa_ins.gfa b/test/rgfa/rgfa_ins.gfa new file mode 100644 index 00000000000..9751b52ba47 --- /dev/null +++ b/test/rgfa/rgfa_ins.gfa @@ -0,0 +1,15 @@ +H VN:Z:1.0 +S 1 CAAATAAG +S 2 TTT +S 3 TTT +S 4 TTT +S 5 TTT +P x 1+,5+ * +P y 1+,2+,4+,5+ * +P z 1+,2+,3+,4+,5+ * +L 1 + 2 + 0M +L 2 + 3 + 0M +L 2 + 4 + 0M +L 3 + 4 + 0M +L 4 + 5 + 0M +L 1 + 5 + 0M diff --git a/test/rgfa/rgfa_ins2.gfa b/test/rgfa/rgfa_ins2.gfa new file mode 100644 index 00000000000..ef6f1f3055c --- /dev/null +++ b/test/rgfa/rgfa_ins2.gfa @@ -0,0 +1,18 @@ +H VN:Z:1.0 +S 1 CAAATAAG +S 2 TTT +S 3 TTT +S 4 TTT +S 5 TTT +S 6 TTTTTTTTTT +P x 1+,5+ * +P y 1+,2+,6+,4+,5+ * +P z 1+,2+,3+,4+,5+ * +L 1 + 2 + 0M +L 2 + 3 + 0M +L 2 + 4 + 0M +L 3 + 4 + 0M +L 4 + 5 + 0M +L 1 + 5 + 0M +L 2 + 6 + 0M +L 6 + 4 + 0M diff --git a/test/rgfa/rgfa_ins3.gfa b/test/rgfa/rgfa_ins3.gfa new file mode 100644 index 00000000000..861ac58d3b5 --- /dev/null +++ b/test/rgfa/rgfa_ins3.gfa @@ -0,0 +1,18 @@ +H VN:Z:1.0 +S 1 CAAATAAG +S 2 TTT +S 3 TTT +S 4 TTT +S 5 TTT +S 6 TTTTTTTTTT +P x 1+,5+ * +P y 5-,4-,6-,2-,1- * +P z 1+,2+,3+,4+,5+ * +L 1 + 2 + 0M +L 2 + 3 + 0M +L 2 + 4 + 0M +L 3 + 4 + 0M +L 4 + 5 + 0M +L 1 + 5 + 0M +L 2 + 6 + 0M +L 6 + 4 + 0M diff --git a/test/rgfa/rgfa_tiny.gfa b/test/rgfa/rgfa_tiny.gfa new file mode 100644 index 00000000000..498d2e0a6dc --- /dev/null +++ b/test/rgfa/rgfa_tiny.gfa @@ -0,0 +1,38 @@ +H VN:Z:1.0 +S 5 C +S 7 A +S 12 ATAT +S 8 G +S 1 CAAATAAG +S 4 T +S 6 TTG +S 15 CCAACTCTCTG +S 2 A +S 10 A +S 9 AAATTTTCTGGAGTTCTAT +S 11 T +S 13 A +S 14 T +S 3 G +P x 1+,3+,5+,6+,8+,9+,11+,12+,14+,15+ * +P y 1+,2+,4+,6+,8+,9+,10+,12+,13+,15+ * +L 5 + 6 + 0M +L 7 + 9 + 0M +L 12 + 13 + 0M +L 12 + 14 + 0M +L 8 + 9 + 0M +L 1 + 2 + 0M +L 1 + 3 + 0M +L 4 + 6 + 0M +L 6 + 7 + 0M +L 6 + 8 + 0M +L 2 + 4 + 0M +L 2 + 5 + 0M +L 10 + 12 + 0M +L 9 + 10 + 0M +L 9 + 11 + 0M +L 11 + 12 + 0M +L 13 + 15 + 0M +L 14 + 15 + 0M +L 3 + 4 + 0M +L 3 + 5 + 0M diff --git 
a/test/t/11_vg_paths.t b/test/t/11_vg_paths.t index 58591697ef5..29d92ba36e4 100644 --- a/test/t/11_vg_paths.t +++ b/test/t/11_vg_paths.t @@ -7,7 +7,7 @@ PATH=../bin:$PATH # for vg export LC_ALL="C" # force a consistent sort order -plan tests 18 +plan tests 23 vg construct -r small/x.fa -v small/x.vcf.gz -a > x.vg vg construct -r small/x.fa -v small/x.vcf.gz > x2.vg @@ -59,4 +59,50 @@ is $? 0 "vg path coverage reports correct lengths in first column" rm -f q.vg q.cov.len q.len +vg paths -v rgfa/rgfa_tiny.gfa -R 1 -Q x -N c | vg convert - -fW > rgfa_tiny.rgfa +printf "P c#0#SN:Z:y:SR:i:1[33-34] 10+ * +P c#0#SN:Z:y:SR:i:1[38-39] 13+ * +P c#0#SN:Z:y:SR:i:1[8-10] 2+,4+ *\n" > rgfa_tiny_expected_fragments.rgfa +grep ^P rgfa_tiny.rgfa | grep SR | sort > rgfa_tiny_fragments.rgfa +diff rgfa_tiny_fragments.rgfa rgfa_tiny_expected_fragments.rgfa +is $? 0 "Found the expected rgfa SNP cover of tiny graph" + +rm -f rgfa_tiny.rgfa rgfa_tiny_expected_fragments.rgfa rgfa_tiny_fragments.rgfa + +vg paths -v rgfa/rgfa_ins.gfa -R 5 -Q x -N c | vg convert - -fW > rgfa_ins.rgfa +printf "P c#0#SN:Z:z:SR:i:1[8-17] 2+,3+,4+ *\n" > rgfa_ins_expected_fragments.rgfa +grep ^P rgfa_ins.rgfa | grep SR | sort > rgfa_ins_fragments.rgfa +diff rgfa_ins_fragments.rgfa rgfa_ins_expected_fragments.rgfa +is $? 0 "Found the expected rgfa cover for simple nested insertion" + +rm -f rgfa_ins.rgfa rgfa_ins_expected_fragments.rgfa rgfa_ins_fragments.rgfa + +vg paths -v rgfa/rgfa_ins2.gfa -R 3 -Q x | vg convert - -fW > rgfa_ins2.rgfa +printf "P _rGFA_#0#SN:Z:y:SR:i:1[8-24] 2+,6+,4+ * +P _rGFA_#0#SN:Z:z:SR:i:2[11-14] 3+ *\n" > rgfa_ins2_expected_fragments.rgfa +grep ^P rgfa_ins2.rgfa | grep SR | sort > rgfa_ins2_fragments.rgfa +diff rgfa_ins2_fragments.rgfa rgfa_ins2_expected_fragments.rgfa +is $? 
0 "Found the expected rgfa cover for simple nested insertion that requires two fragments" + +rm -f rgfa_ins2.rgfa rgfa_ins2_expected_fragments.rgfa rgfa_ins2_fragments.rgfa + +vg paths -v rgfa/rgfa_ins2.gfa -R 5 -Q x -N c | vg convert - -fW > rgfa_ins2R5.rgfa +printf "P c#0#SN:Z:y:SR:i:1[8-24] 2+,6+,4+ *\n" > rgfa_ins2R5_expected_fragments.rgfa +grep ^P rgfa_ins2R5.rgfa | grep SR | sort > rgfa_ins2R5_fragments.rgfa +diff rgfa_ins2R5_fragments.rgfa rgfa_ins2R5_expected_fragments.rgfa +is $? 0 "rgfa Minimum fragment length filters out small fragment" + +rm -f rgfa_ins2R5.rgfa rgfa_ins2R5_expected_fragments.rgfa rgfa_ins2R5_fragments.rgfa + +vg paths -v rgfa/rgfa_ins3.gfa -R 3 -Q x -N c | vg convert - -fW > rgfa_ins3.rgfa +printf "P x 1+,5+ * +P c#0#SN:Z:y:SR:i:1[3-19] 4+,6+,2+ * +P y 5-,4+,6+,2+,1- * +P c#0#SN:Z:z:SR:i:2[11-14] 3+ * +P z 1+,2-,3+,4-,5+ *\n" | sort > rgfa_ins3_expected_fragments.rgfa +grep ^P rgfa_ins3.rgfa | sort > rgfa_ins3_fragments.rgfa +diff rgfa_ins3_fragments.rgfa rgfa_ins3_expected_fragments.rgfa +is $? 0 "Found the expected rgfa cover for simple nested insertion that has a reversed path that needs forwardization" + +rm -f rgfa_ins3.rgfa rgfa_ins3_expected_fragments.rgfa rgfa_ins3_fragments.rgfa