Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <iostream>
- #include <vector>
- #include <fstream>
- #include <algorithm>
- #include <sstream>
- #include <bitset>
- #include <cmath>
- #define FASTA "/home/zuzu/CLionProjects/akwbIII/sample.fasta"
- #define QUAL "/home/zuzu/CLionProjects/akwbIII/sample.qual"
- using namespace std;
- struct sequence {
- string name;
- string nucleotydes;
- vector<long int> qual;
- };
- struct node {
- string name;
- string nucleotydes;
- vector<long int> qual;
- unsigned int number_of_sequence = 0;
- unsigned int position = 0;
- vector<node *> connected_nodes;
- bool operator==(node &other) {
- return name == other.name && nucleotydes == other.nucleotydes && position == other.position;
- }
- };
- struct clique {
- vector<unsigned int> positions;
- vector<unsigned int> number_of_seq;
- };
- struct series_of_cliques {
- vector<clique> series;
- unsigned int length = 0;
- };
- bool find_node(vector<node *> &connected, node &node1) {
- for (auto &it : connected) {
- if (*it == node1) return true;
- }
- return false;
- }
- vector<sequence> read_from_file() {
- vector<sequence> all_sequences;
- sequence one_sequence;
- ifstream file(FASTA);
- while (file.good()) {
- getline(file, one_sequence.name);
- getline(file, one_sequence.nucleotydes);
- all_sequences.push_back(one_sequence);
- }
- file.close();
- file.open(QUAL);
- string name_of_seq;
- string line;
- unsigned int temp_qual;
- while (getline(file, name_of_seq))
- for (auto &sequence_r : all_sequences)
- if (name_of_seq == sequence_r.name) {
- getline(file, line);
- istringstream l(line);
- while (l >> temp_qual) sequence_r.qual.push_back(temp_qual);
- if (sequence_r.nucleotydes.size() != sequence_r.qual.size()) {
- cout << "Wielkość sekwencji nukleotydów nie zgadza się z liczbą oceny wiarygodności nukleotydów"
- << endl;
- cout << "Nukleotydów: " << sequence_r.nucleotydes.size() << endl << "Wiarygodności: "
- << sequence_r.qual.size() << endl;
- exit(1);
- }
- break;
- }
- file.close();
- cout << endl << endl;
- return all_sequences;
- }
- vector<vector<node>> make_graph(int &n, int q, vector<sequence> &all_sequences) {
- vector<node> nodes_in_seq;
- vector<vector<node>> graph;
- unsigned int number = 0;
- unsigned int pos = 0;
- for (auto &sequence_r : all_sequences) {
- pos = 0;
- auto it = sequence_r.nucleotydes.begin() + n;
- auto sec_it = sequence_r.nucleotydes.begin();
- for (unsigned short int i = 0; i < (sequence_r.nucleotydes.length() - n + 1); i++) {
- node temp_node;
- temp_node.number_of_sequence = number;
- temp_node.name = sequence_r.name;
- string temp(sec_it, it);
- temp_node.nucleotydes = temp;
- sec_it++;
- it++;
- temp_node.position = pos;
- if (sequence_r.qual.empty()) fprintf(stderr, "empty");
- for (unsigned int j = pos; j < pos + n; j++) {
- temp_node.qual.push_back(sequence_r.qual[j]);
- }
- pos++;
- nodes_in_seq.push_back(temp_node);
- }
- number++;
- graph.push_back(nodes_in_seq);
- }
- for (auto &sequences : graph)
- for (auto &seqenc : sequences)
- for (unsigned int quality_it = 0; quality_it < seqenc.qual.size(); quality_it++)
- if (seqenc.qual[quality_it] < q) {
- seqenc.nucleotydes.erase(seqenc.nucleotydes.begin() + quality_it);
- seqenc.qual.erase(seqenc.qual.begin() + quality_it);
- }
- for (auto &seq1 : graph)
- for (auto &seq : graph)
- for (auto &node_of_seq : seq1)
- for (auto node_of_other : seq)
- if (node_of_seq.number_of_sequence != node_of_other.number_of_sequence &&
- (node_of_seq.nucleotydes == node_of_other.nucleotydes ||
- node_of_seq.nucleotydes.find(node_of_other.nucleotydes) != string::npos)) {
- if (!find_node(node_of_seq.connected_nodes, node_of_other))
- node_of_seq.connected_nodes.push_back(&node_of_other);
- if (!find_node(node_of_other.connected_nodes, node_of_seq))
- node_of_other.connected_nodes.push_back(&node_of_seq);
- }
- return graph;
- }
- series_of_cliques find_series_of_cliques(vector<vector<node>> &graph) {
- series_of_cliques best_series = series_of_cliques();
- series_of_cliques temp_series = series_of_cliques();
- clique temp_clique = clique();
- clique last = clique();
- int i = 0;
- for (;;) {
- for (auto ¤t_seq : graph[i]) {
- if (last.positions.empty() || current_seq.position == last.positions[i] + 1) {
- for (auto &next_seq : graph[i + 1]) {
- if (temp_clique.positions.empty()) {
- if (find_node(current_seq.connected_nodes, next_seq)) {
- if (last.positions.empty() || next_seq.position == last.positions[i + 1] + 1) {
- temp_clique.positions.push_back(current_seq.position);
- temp_clique.number_of_seq.push_back(current_seq.number_of_sequence);
- temp_clique.positions.push_back(next_seq.position);
- temp_clique.number_of_seq.push_back(next_seq.number_of_sequence);
- break;
- }
- }
- } else {
- if (find(temp_clique.positions.begin(), temp_clique.positions.end(),
- current_seq.position) != temp_clique.positions.end()
- && find_node(current_seq.connected_nodes, next_seq)) {
- if (last.positions.empty() || next_seq.position == last.positions[i + 1] + 1) {
- temp_clique.positions.push_back(next_seq.position);
- temp_clique.number_of_seq.push_back(next_seq.number_of_sequence);
- break;
- }
- }
- }
- }
- }
- }
- if (i + 2 != temp_clique.positions.size()) {
- i = 0;
- if (temp_series.length > best_series.length) {
- best_series = temp_series;
- }
- temp_series.length = 0;
- temp_series.series.clear();
- }
- i++;
- if (temp_clique.positions.size() == graph.size()) {
- temp_series.series.push_back(temp_clique);
- temp_series.length++;
- last = temp_clique;
- temp_clique.positions.clear();
- temp_clique.number_of_seq.clear();
- i = 0;
- }
- if (temp_series.length > best_series.length) {
- best_series = temp_series;
- }
- if (best_series.length > graph[0].size() - best_series.length || i + 1 == graph.size()) {
- break;
- }
- }
- return best_series;
- }
- vector<vector<unsigned int>> comb(unsigned int &K) {
- vector<vector<unsigned int>> results;
- for (unsigned int i = 0; i < pow(2, 8) - 1; i++) {
- bitset<7> t(i);
- if (t.count() == K) {
- vector<unsigned int> v1;
- for (unsigned int j = 0; j < 7; j++)
- if (t[j]) v1.push_back(j);
- results.push_back(v1);
- }
- }
- return results;
- }
- void print_best_clique(int &n, unsigned long number_of_sequences, vector<sequence> &seq, series_of_cliques &best) {
- string temp;
- cout << "Najlepsza seria klik o " << number_of_sequences << " wierzchołkach" << endl;
- cout << "Długość serii: " << best.length << endl;
- for (int i = 0; i < best.series.size(); i++) {
- for (int j = 0; j < number_of_sequences; j++) {
- cout << "Nr sekwencji: " << best.series[0].number_of_seq[j] << " Pozycja: " << best.series[0].positions[j]
- << " ";
- temp = seq[best.series[0].number_of_seq[j]].nucleotydes.substr(
- static_cast<unsigned long>(best.series[0].positions[j]),
- (best.length + n));
- cout << temp << endl;
- }
- }
- cout << " pierwsza pozycja w sekwencji: " << best.series[0].positions[0] << endl;
- }
- int main() {
- int n = 5;
- vector<sequence> seq = read_from_file();
- cout << "read_from_file completed" << endl;
- vector<vector<node>> graph = make_graph(n, 25, seq); // n, q, seq
- cout << "make_graph completed" << endl;
- vector<vector<node>> temp_graph;
- series_of_cliques temp_clique;
- series_of_cliques best = series_of_cliques();
- for (unsigned int k = 4; k <= 7; k++) {
- cout << "k=" << k << " started" << endl;
- vector<vector<unsigned int>> combinations = comb(k);
- for (auto &i : combinations) {
- for (auto &j: i) {
- temp_graph.push_back(graph[j]);
- }
- temp_clique = find_series_of_cliques(temp_graph);
- cout << "find_series_of_cliques completed" << endl;
- if (temp_clique.length > best.length) best = temp_clique;
- }
- print_best_clique(n, combinations[0].size(), seq, best);
- }
- return 0;
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement