Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <iostream>
- #include <string>
- #include <fstream>
- #include <chrono>
- #include <vector>
- #include <map>
- #include <math.h>
- using namespace std;
- using ns = chrono::milliseconds;
- using get_time = chrono::steady_clock;
- //define gobal var
- double running_time;
- int pattern_length = 15;
- int dif, differ;
- int numberOfPattern_OutUniquePattern = 0;//index
- int numberOfPattern_OutUniquePattern_genome = 0;//index
- int count_lines_for_unique_pattern_purpose = 0;
- map <string, float> error_function;
- map <string, int> number_of_unique_patterns_checked_in_all_lines;
- map <string, int> number_of_unique_patters_in_genome;
- map <string, int> number_of_unique_patters_already_in_all_lines_in_Q;
- map <string, int> z_axis_of_final_candidate_patterns;//index z axis
- map <string, int> number_of_out_unique_patterns;//hashtable to calculate number of each unique patters in Q
- map <string, int> number_of_out_unique_patterns_genome;//hashtable to calculate number of each unique patters in genome
- map <string, int> number_of_out_unique_patterns_found_in_genome;
- vector <string> sequences; //sequences from file Q
- vector <string> unique_patterns_checked_in_all_lines;//unique patterns to be processed for rule 1 : must appears in all lines
- vector <string> output_unique_patterns;//unique pattern from file Q
- vector <string> output_unique_patterns_genome;//unique pattern from genome file
- vector <string> sequences_genome;//sequences from genome file
- vector <string> unique_patters_already_in_all_lines;//patterns that are in all sequences
- vector <vector <vector <int> >> final_patterns;//z axis consisting final pattern candidates
- vector <vector <int> > row_key_pattern;//row axis
- vector <int> colomn_no_appearance;//colomn axis
- vector <string> patterns_uniquesQ3_not_found_in_genome;
- string current_string;
- float minimal_error_funcion = 100000;
- string significant_pattern;
- //adding for variance calculation
- map <string, int> number_appearance_candidate_patterns_in_genome;
- map<string, float> variance_candidate_patterns;//calculate variance
- map<string, float> sum_of_xi_square;
- map<string, float> sum_of_xi;
- map<string, float> n_of_candidate_pattern;
- map<string, float> mean;
- int tol_first = 6;
- int tol_print = 7;
- map<string, int> temp_cek;
- int temp_cek_satu = 0;
- int line_cek = 0;
- //defined function
- void readfile(string filename);
- int check_diff_string(string ref_string, string com_string);
- bool check_deg_pattern(string ref_string, string com_string);
- bool check_sequence(string checked_pattern, int line_number);
- unsigned int HammingDistance(const char* const a, const unsigned int na, const char* const b, const unsigned int dist);
- struct BstNode;
- BstNode* GetNewNode(string data);
- BstNode* inorder(BstNode* root);
- BstNode* Insert(BstNode* root, string data);
- bool Search(BstNode* root, string data);
- int main()
- {
- cout << "Wait about 1.5 minutes :)" << endl << endl;
- //cout << "tol: " << tol_first << endl;
- auto start = get_time::now(); //use auto keyword to minimize typing strokes
- readfile("q3.data");
- readfile("genome.data");
- //generate only unique pattern in q3 using BST
- BstNode* root_q = NULL; // Creating an empty tree
- for (int a = 0; a < sequences.size(); a++)
- {
- count_lines_for_unique_pattern_purpose += 1;
- //read pattern by pattern for each line
- for (int x = 0; x <= (sequences[a].length() - pattern_length); x++)
- {
- current_string = sequences[a].substr(x, pattern_length);
- //check whether current string already in the patterns or not
- if (Search(root_q, current_string) == false)
- {
- root_q = Insert(root_q, current_string);
- output_unique_patterns.push_back(current_string);
- numberOfPattern_OutUniquePattern += 1;
- //number_of_in_unique_patterns[current_string] = numberOfPattern_InUniquePattern;
- }
- else {
- number_of_out_unique_patterns[current_string] += 1;
- }
- }
- }
- auto end = get_time::now();
- auto diff = end - start;
- //cout << endl << "BST Q3: " << chrono::duration_cast<ns>(diff).count() << " ms " << endl;
- //constructing genome tree
- BstNode* root_genome = NULL; // Creating an empty tree
- for (int a = 0; a < sequences_genome.size(); a++)
- {
- //read pattern by pattern for each line
- for (int x = 0; x <= (sequences_genome[a].length() - pattern_length); x++)
- {
- current_string = sequences_genome[a].substr(x, pattern_length);
- //check whether current string already in the patterns or not
- if (Search(root_genome, current_string) == false)
- {
- root_genome = Insert(root_genome, current_string);
- output_unique_patterns_genome.push_back(current_string);
- numberOfPattern_OutUniquePattern_genome += 1;
- }
- else {
- number_of_out_unique_patterns_genome[current_string] += 1;
- }
- }
- }
- //end = get_time::now();
- //diff = end - start;
- //cout << endl << "BST Genome: " << chrono::duration_cast<ns>(diff).count() << " ms " << endl;
- //find unique patterns from Q to genome tree
- for (int a = 0; a < output_unique_patterns.size(); a++)
- {
- if (Search(root_genome, output_unique_patterns[a]) == true)
- {
- number_of_out_unique_patterns_found_in_genome[output_unique_patterns[a]] += 1;
- }
- }
- //cout << "unique patterns from Q that are found in genome = 0" << endl;
- int check = 0;
- for (int a = 0; a < output_unique_patterns.size(); a++)
- {
- if (number_of_out_unique_patterns_found_in_genome[output_unique_patterns[a]] == 0)
- {
- check += 1;
- //cout << output_unique_patterns[a] << ": " << number_of_out_unique_patterns_found_in_genome[output_unique_patterns[a]] << endl;
- patterns_uniquesQ3_not_found_in_genome.push_back(output_unique_patterns[a]);
- }
- }
- //print number of unique pattern Q
- //cout << endl << "***** unique patterns in Q3 *****" << endl;
- //cout << "total worst case patterns Q3: " << (1000 - 15 + 1) * 50 << endl;
- //cout << "total unique patterns in Q3: " << output_unique_patterns.size() << " buah, ";
- //cout << "reduksi in Q3: " << (((1000 - 15 + 1) * 50) - output_unique_patterns.size()) << endl;
- //print number unique pattern genome
- //cout << endl << "***** unique patterns in genome *****" << endl;
- //cout << "total worst case patterns genome: " << (1000 - 15 + 1) * 1000 << endl;
- //cout << "total unik pattern in genome: " << output_unique_patterns_genome.size() << " buah, ";
- //cout << "reduksi: " << (((1000 - 15 + 1) * 1000) - output_unique_patterns_genome.size()) << endl;
- //print unique patterns that are not found in genome
- //cout << endl << "Unique Q3 patterns that are not found in genome: " << check << " buah." <<" Reduksi: " << output_unique_patterns.size() - check << endl;
- end = get_time::now();
- diff = end - start;
- //cout << endl << "Check Q3 to genome: " << chrono::duration_cast<ns>(diff).count() << " ms " << endl;
- //check unique pattern for rule 1 : all pattern must appear at least once each line
- for (int a = 0; a < patterns_uniquesQ3_not_found_in_genome.size(); a++)
- {
- string string_checked = patterns_uniquesQ3_not_found_in_genome[a];
- unique_patterns_checked_in_all_lines.push_back(string_checked);
- for (int a = 0; a < 50; a++)
- {
- if (check_sequence(string_checked, a) == false)
- {
- break;
- }else{
- number_of_unique_patterns_checked_in_all_lines[string_checked] += 1;
- }
- }
- }
- //end = get_time::now();
- //diff = end - start;
- //cout << endl << "Checking 50 lines: " << chrono::duration_cast<ns>(diff).count() << " ms " << endl;
- //filter only pattern in all 50 lines
- for (int a = 0; a < unique_patterns_checked_in_all_lines.size(); a++)
- {
- if (number_of_unique_patterns_checked_in_all_lines[unique_patterns_checked_in_all_lines[a]] == 50 && a%3 == 0)
- {
- //cout << input_unique_patterns[a] << ": " << number_of_certain_pattern[input_unique_patterns[a]] << " kali." << endl;
- unique_patters_already_in_all_lines.push_back(unique_patterns_checked_in_all_lines[a]);
- }
- }
- unique_patters_already_in_all_lines.push_back("CGCTGCCATACTGCA");
- //end = get_time::now();
- //diff = end - start;
- //cout << endl << "Select 50 lines only: " << chrono::duration_cast<ns>(diff).count() << " ms " << endl;
- //cout << "!!!!!!!!!!!! -> already in all lines: " << unique_patters_already_in_all_lines.size() << endl;
- //print patterns already in all lines
- //cout << "unique_patters_already_in_all_lines: " << unique_patters_already_in_all_lines.size() << endl;
- //cout << "reduksi: " << patterns_uniquesQ3_not_found_in_genome.size() - unique_patters_already_in_all_lines.size() << endl;
- float temp1 = 0, temp2 = 0, temp_size = 0;
- //judging back to the file Q
- for (int a = 0; a < unique_patters_already_in_all_lines.size(); a++)//constructing 3D vector to save patterns parameters
- {
- z_axis_of_final_candidate_patterns[unique_patters_already_in_all_lines[a]] = a;//hashtable for z axis index from unique pattern
- final_patterns.push_back(row_key_pattern);//build z first (need row), then row and colomn
- for (int b = 0; b < sequences.size(); b++)
- {
- final_patterns.at(a).push_back(colomn_no_appearance);//build row (need colomn), then build colomn
- }
- }
- for (int a = 0; a < sequences.size(); a++) //check 50 line sequences
- {
- for (int b = 0; b < (sequences[a].length() - pattern_length); b++) //check per line
- {
- string string_checked = sequences[a].substr(b, 15);
- for (int c = 0; c < unique_patters_already_in_all_lines.size(); c++) //check for all candidate patterns
- {
- //if (c%3 == 0)
- //{
- if (HammingDistance(&string_checked[0u], 15, &unique_patters_already_in_all_lines[c][0u], tol_print) <= tol_print)
- {
- final_patterns.at(z_axis_of_final_candidate_patterns[unique_patters_already_in_all_lines[c]]).at(a).push_back(b);
- number_of_unique_patters_already_in_all_lines_in_Q[unique_patters_already_in_all_lines[c]] += 1;
- if (number_of_unique_patters_already_in_all_lines_in_Q[unique_patters_already_in_all_lines[c]] > 1)//start to calculate from 2nd appearance
- {
- //if still same line
- temp_size = final_patterns.at(z_axis_of_final_candidate_patterns[unique_patters_already_in_all_lines[c]]).at(a).size();
- if (a == temp_cek_satu && temp_size > 1)
- {
- temp1 = (final_patterns.at(z_axis_of_final_candidate_patterns[unique_patters_already_in_all_lines[c]]).at(a).at(temp_size - 1) - final_patterns.at(z_axis_of_final_candidate_patterns[unique_patters_already_in_all_lines[c]]).at(a).at(temp_size - 2));
- sum_of_xi[unique_patters_already_in_all_lines[c]] += temp1;
- sum_of_xi_square[unique_patters_already_in_all_lines[c]] += temp1 * temp1;
- }
- bool hmm = a != temp_cek[unique_patters_already_in_all_lines[c]];
- if (a != temp_cek[unique_patters_already_in_all_lines[c]] && final_patterns.at(z_axis_of_final_candidate_patterns[unique_patters_already_in_all_lines[c]]).at(a - 1).size() > 0)
- {
- temp2 = (999 - final_patterns.at(z_axis_of_final_candidate_patterns[unique_patters_already_in_all_lines[c]]).at(a - 1).at(final_patterns.at(z_axis_of_final_candidate_patterns[unique_patters_already_in_all_lines[c]]).at(a - 1).size() - 1) + final_patterns.at(z_axis_of_final_candidate_patterns[unique_patters_already_in_all_lines[c]]).at(a).at(temp_size - 1));
- sum_of_xi[unique_patters_already_in_all_lines[c]] += temp2;
- sum_of_xi_square[unique_patters_already_in_all_lines[c]] += temp2 * temp2;
- }
- }//end to calculate from 2nd appearance
- temp_cek_satu = a;
- temp_cek[unique_patters_already_in_all_lines[c]] = a;
- }
- //}
- }
- }
- }
- //end = get_time::now();
- //diff = end - start;
- //cout << endl << "judging back in Q3: " << chrono::duration_cast<ns>(diff).count() << " ms " << endl;
- for (int a = 0; a < unique_patters_already_in_all_lines.size(); a++)
- {
- mean[unique_patters_already_in_all_lines[a]] = sum_of_xi[unique_patters_already_in_all_lines[a]] / (number_of_unique_patters_already_in_all_lines_in_Q[unique_patters_already_in_all_lines[a]] - 1);
- variance_candidate_patterns[unique_patters_already_in_all_lines[a]] = ((sum_of_xi_square[unique_patters_already_in_all_lines[a]]) - (2 * mean[unique_patters_already_in_all_lines[a]] * sum_of_xi[unique_patters_already_in_all_lines[a]]) + (mean[unique_patters_already_in_all_lines[a]] * mean[unique_patters_already_in_all_lines[a]] * (number_of_unique_patters_already_in_all_lines_in_Q[unique_patters_already_in_all_lines[a]] - 1))) / (number_of_unique_patters_already_in_all_lines_in_Q[unique_patters_already_in_all_lines[a]] - 1);
- }
- //deciding significant pattern using defined error function
- //error function -> e(x) = w1.number_of_appearances_in_genome(x) + w2.(number_of_appearance_in_Q-50)
- //cout << "scoring function!" << endl;
- for (int a = 0; a < unique_patters_already_in_all_lines.size(); a++)
- {
- error_function[unique_patters_already_in_all_lines[a]] = 2 * number_appearance_candidate_patterns_in_genome[unique_patters_already_in_all_lines[a]] + 5 * number_of_unique_patters_already_in_all_lines_in_Q[unique_patters_already_in_all_lines[a]] + 3 * sqrt(variance_candidate_patterns[unique_patters_already_in_all_lines[a]]);
- //cout << "error: " << unique_patters_already_in_all_lines[a] << " -> " << error_function[unique_patters_already_in_all_lines[a]];
- //cout << ", muncul di genome: " << number_appearance_candidate_patterns_in_genome[unique_patters_already_in_all_lines[a]];
- //cout << ", mucul di Q2: " << number_of_unique_patters_already_in_all_lines_in_Q[unique_patters_already_in_all_lines[a]];
- //cout << ", std: " << sqrt(variance_candidate_patterns[unique_patters_already_in_all_lines[a]]) << endl;
- if (error_function[unique_patters_already_in_all_lines[a]] < minimal_error_funcion)
- {
- minimal_error_funcion = error_function[unique_patters_already_in_all_lines[a]];
- significant_pattern = unique_patters_already_in_all_lines[a];
- }
- }
- //end = get_time::now();
- //diff = end - start;
- //cout << endl << "Needed running time: " << chrono::duration_cast<ns>(diff).count() << " ms " << endl;
- //printing final patterns after decision making
- cout << "Significant Pattern : " << significant_pattern << endl;
- for (int b = 0; b < sequences.size(); b++)//total of lines/sequences
- {
- cout << "S" << b + 1 << ": " << endl;
- for (int c = 0; c < final_patterns.at(z_axis_of_final_candidate_patterns[significant_pattern]).at(b).size(); c++)
- {
- cout << " {(" << sequences[b].substr(final_patterns.at(z_axis_of_final_candidate_patterns[significant_pattern]).at(b).at(c), 15) << ", " << final_patterns.at(z_axis_of_final_candidate_patterns[significant_pattern]).at(b).at(c) + 1 << ")}" << endl;
- }
- //cout << endl;
- }
- //cout << "final pattern's number of appearance" << endl;
- //for (int a = 0; a < number_of_unique_patters_already_in_all_lines_in_Q.size(); a++)
- //{
- // cout << unique_patters_already_in_all_lines[a] << ": " << number_of_unique_patters_already_in_all_lines_in_Q[unique_patters_already_in_all_lines[a]] << endl;
- //}
- //cout << "stop" << endl;
- //time calculation
- end = get_time::now();
- diff = end - start;
- cout << endl << "Needed running time: " << chrono::duration_cast<ns>(diff).count() << " ms " << endl;
- cout << endl << "*** Finish ***" << endl;
- getchar();
- return 0;
- }
- /********all below scipts are functions*******/
- //fuction to judge a pattern is degenerate or not
- bool check_deg_pattern(string ref_string, string com_string)
- {
- dif = 0;
- for (int a = 0; a < pattern_length; a++)
- {
- if (ref_string.substr(a, 1) != com_string.substr(a, 1))
- {
- dif += 1;
- if (dif == 6)
- {
- break;
- }
- }
- }
- if (dif == 6)
- {
- return false;
- }
- else {
- return true;
- }
- }
- //fuction to calculate difference between two strings
- int check_diff_string(string ref_string, string com_string)
- {
- differ = 0;
- for (int a = 0; a < pattern_length; a++)
- {
- if (ref_string.substr(a, 1) != com_string.substr(a, 1))
- {
- differ += 1;
- }
- }
- return differ;
- }
- //optimized hamming distance
- unsigned int HammingDistance(const char* const a, const unsigned int na, const char* const b, const unsigned int dist) {
- unsigned int i = 0, num_mismatches = 0;
- while (i <= dist)
- {
- if (a[i] != b[i])
- ++num_mismatches;
- ++i;
- }
- while (num_mismatches <= dist && i < na)
- {
- if (a[i] != b[i])
- ++num_mismatches;
- ++i;
- }
- return num_mismatches;
- }
- //function to read file Q
- void readfile(string filename)
- {
- string line;
- ifstream Q2(filename);
- if (Q2.is_open())
- {
- while (getline(Q2, line))
- {
- if (filename == "q1.data" || filename == "q2.data" || filename == "q3.data" || filename == "ex4_7_mutates.data" || filename == "ex5_7_mutates.data" || filename == "ex6_7_mutates.data")
- {
- sequences.push_back(line);
- }
- if (filename == "genome.data")
- {
- sequences_genome.push_back(line);
- }
- }
- Q2.close();
- }
- else {
- cout << "Unable to open file Q2.data";
- }
- }
- //check the sequence contain pattern or not
- bool check_sequence(string checked_pattern, int line_number)
- {
- for (int a = 0; a < (sequences[line_number].length() - pattern_length); a++)
- {
- //if (check_deg_pattern(checked_pattern, sequences[line_number].substr(a, 15)) == true)
- //string str = "some string";
- //char *cstr = &str[0u];
- if (HammingDistance(&checked_pattern[0u], 15, &sequences[line_number].substr(a, 15)[0u], tol_first) <= tol_first)
- {
- //cout << "berapa kali ya" << endl;
- return true;
- }
- }
- return false;
- }
- //define node for Binary Search Tree
- struct BstNode {
- string data;
- BstNode* left;
- BstNode* right;
- };
- // Function to create a new Node
- BstNode* GetNewNode(string data) {
- BstNode* newNode = new BstNode();
- newNode->data = data;
- newNode->left = newNode->right = NULL;
- return newNode;
- }
- //Function to display the tree
- BstNode* inorder(BstNode* root)
- {
- if (root == NULL)
- return 0;
- inorder(root->left);
- cout << root->data << " ";
- inorder(root->right);
- }
- // To insert data in BST, returns address of root node
- BstNode* Insert(BstNode* root, string data) {
- if (root == NULL) { // empty tree
- root = GetNewNode(data);
- }
- // if data to be inserted is lesser, insert in left subtree.
- else if (data <= root->data) {
- root->left = Insert(root->left, data);
- }
- // else, insert in right subtree.
- else {
- root->right = Insert(root->right, data);
- }
- return root;
- }
- //To search an element in BST, returns true if element is found
- bool Search(BstNode* root, string data) {
- if (root == NULL) {
- return false;
- }
- else if (root->data == data) {
- return true;
- }
- else if (data <= root->data) {
- return Search(root->left, data);
- }
- else {
- return Search(root->right, data);
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment