ArdianUmam

Untitled

Apr 29th, 2017
124
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 18.44 KB | None | 0 0
  1. #include <iostream>
  2. #include <string>
  3. #include <fstream>
  4. #include <chrono>
  5. #include <vector>
  6. #include <map>
  7. #include <math.h>
  8.  
  9. using namespace std;
  10. using ns = chrono::milliseconds;
  11. using get_time = chrono::steady_clock;
  12.  
  13. //define gobal var
  14. double running_time;
  15. int pattern_length = 15;
  16. int dif, differ;
  17. int numberOfPattern_OutUniquePattern = 0;//index
  18. int numberOfPattern_OutUniquePattern_genome = 0;//index
  19. int count_lines_for_unique_pattern_purpose = 0;
  20. map <string, float> error_function;
  21. map <string, int> number_of_unique_patterns_checked_in_all_lines;
  22. map <string, int> number_of_unique_patters_in_genome;
  23. map <string, int> number_of_unique_patters_already_in_all_lines_in_Q;
  24. map <string, int> z_axis_of_final_candidate_patterns;//index z axis
  25. map <string, int> number_of_out_unique_patterns;//hashtable to calculate number of each unique patters in Q
  26. map <string, int> number_of_out_unique_patterns_genome;//hashtable to calculate number of each unique patters in genome
  27. map <string, int> number_of_out_unique_patterns_found_in_genome;
  28. vector <string> sequences; //sequences from file Q
  29. vector <string> unique_patterns_checked_in_all_lines;//unique patterns to be processed for rule 1 : must appears in all lines
  30. vector <string> output_unique_patterns;//unique pattern from file Q
  31. vector <string> output_unique_patterns_genome;//unique pattern from genome file
  32. vector <string> sequences_genome;//sequences from genome file
  33. vector <string> unique_patters_already_in_all_lines;//patterns that are in all sequences
  34. vector <vector <vector <int> >> final_patterns;//z axis consisting final pattern candidates
  35. vector <vector <int> > row_key_pattern;//row axis
  36. vector <int> colomn_no_appearance;//colomn axis
  37. vector <string> patterns_uniquesQ3_not_found_in_genome;
  38. string current_string;
  39. float minimal_error_funcion = 100000;
  40. string significant_pattern;
  41. //adding for variance calculation
  42. map <string, int> number_appearance_candidate_patterns_in_genome;
  43. map<string, float> variance_candidate_patterns;//calculate variance
  44. map<string, float> sum_of_xi_square;
  45. map<string, float> sum_of_xi;
  46. map<string, float> n_of_candidate_pattern;
  47. map<string, float> mean;
  48. int tol_first = 6;
  49. int tol_print = 7;
  50. map<string, int> temp_cek;
  51. int temp_cek_satu = 0;
  52. int line_cek = 0;
  53.  
  54. //defined function
  55. void readfile(string filename);
  56. int check_diff_string(string ref_string, string com_string);
  57. bool check_deg_pattern(string ref_string, string com_string);
  58. bool check_sequence(string checked_pattern, int line_number);
  59. unsigned int HammingDistance(const char* const a, const unsigned int na, const char* const b, const unsigned int dist);
  60. struct BstNode;
  61. BstNode* GetNewNode(string data);
  62. BstNode* inorder(BstNode* root);
  63. BstNode* Insert(BstNode* root, string data);
  64. bool Search(BstNode* root, string data);
  65.  
  66. int main()
  67. {
  68. cout << "Wait about 1.5 minutes :)" << endl << endl;
  69. //cout << "tol: " << tol_first << endl;
  70. auto start = get_time::now(); //use auto keyword to minimize typing strokes
  71. readfile("q3.data");
  72. readfile("genome.data");
  73.  
  74. //generate only unique pattern in q3 using BST
  75. BstNode* root_q = NULL; // Creating an empty tree
  76. for (int a = 0; a < sequences.size(); a++)
  77. {
  78. count_lines_for_unique_pattern_purpose += 1;
  79. //read pattern by pattern for each line
  80. for (int x = 0; x <= (sequences[a].length() - pattern_length); x++)
  81. {
  82. current_string = sequences[a].substr(x, pattern_length);
  83. //check whether current string already in the patterns or not
  84. if (Search(root_q, current_string) == false)
  85. {
  86. root_q = Insert(root_q, current_string);
  87. output_unique_patterns.push_back(current_string);
  88. numberOfPattern_OutUniquePattern += 1;
  89. //number_of_in_unique_patterns[current_string] = numberOfPattern_InUniquePattern;
  90. }
  91. else {
  92. number_of_out_unique_patterns[current_string] += 1;
  93. }
  94. }
  95. }
  96. auto end = get_time::now();
  97. auto diff = end - start;
  98. //cout << endl << "BST Q3: " << chrono::duration_cast<ns>(diff).count() << " ms " << endl;
  99.  
  100. //constructing genome tree
  101. BstNode* root_genome = NULL; // Creating an empty tree
  102. for (int a = 0; a < sequences_genome.size(); a++)
  103. {
  104. //read pattern by pattern for each line
  105. for (int x = 0; x <= (sequences_genome[a].length() - pattern_length); x++)
  106. {
  107. current_string = sequences_genome[a].substr(x, pattern_length);
  108. //check whether current string already in the patterns or not
  109. if (Search(root_genome, current_string) == false)
  110. {
  111. root_genome = Insert(root_genome, current_string);
  112. output_unique_patterns_genome.push_back(current_string);
  113. numberOfPattern_OutUniquePattern_genome += 1;
  114. }
  115. else {
  116. number_of_out_unique_patterns_genome[current_string] += 1;
  117. }
  118. }
  119. }
  120. //end = get_time::now();
  121. //diff = end - start;
  122. //cout << endl << "BST Genome: " << chrono::duration_cast<ns>(diff).count() << " ms " << endl;
  123.  
  124. //find unique patterns from Q to genome tree
  125. for (int a = 0; a < output_unique_patterns.size(); a++)
  126. {
  127. if (Search(root_genome, output_unique_patterns[a]) == true)
  128. {
  129. number_of_out_unique_patterns_found_in_genome[output_unique_patterns[a]] += 1;
  130. }
  131. }
  132. //cout << "unique patterns from Q that are found in genome = 0" << endl;
  133. int check = 0;
  134. for (int a = 0; a < output_unique_patterns.size(); a++)
  135. {
  136. if (number_of_out_unique_patterns_found_in_genome[output_unique_patterns[a]] == 0)
  137. {
  138. check += 1;
  139. //cout << output_unique_patterns[a] << ": " << number_of_out_unique_patterns_found_in_genome[output_unique_patterns[a]] << endl;
  140. patterns_uniquesQ3_not_found_in_genome.push_back(output_unique_patterns[a]);
  141. }
  142. }
  143.  
  144. //print number of unique pattern Q
  145. //cout << endl << "***** unique patterns in Q3 *****" << endl;
  146. //cout << "total worst case patterns Q3: " << (1000 - 15 + 1) * 50 << endl;
  147. //cout << "total unique patterns in Q3: " << output_unique_patterns.size() << " buah, ";
  148. //cout << "reduksi in Q3: " << (((1000 - 15 + 1) * 50) - output_unique_patterns.size()) << endl;
  149. //print number unique pattern genome
  150. //cout << endl << "***** unique patterns in genome *****" << endl;
  151. //cout << "total worst case patterns genome: " << (1000 - 15 + 1) * 1000 << endl;
  152. //cout << "total unik pattern in genome: " << output_unique_patterns_genome.size() << " buah, ";
  153. //cout << "reduksi: " << (((1000 - 15 + 1) * 1000) - output_unique_patterns_genome.size()) << endl;
  154. //print unique patterns that are not found in genome
  155. //cout << endl << "Unique Q3 patterns that are not found in genome: " << check << " buah." <<" Reduksi: " << output_unique_patterns.size() - check << endl;
  156. end = get_time::now();
  157. diff = end - start;
  158. //cout << endl << "Check Q3 to genome: " << chrono::duration_cast<ns>(diff).count() << " ms " << endl;
  159.  
  160. //check unique pattern for rule 1 : all pattern must appear at least once each line
  161. for (int a = 0; a < patterns_uniquesQ3_not_found_in_genome.size(); a++)
  162. {
  163. string string_checked = patterns_uniquesQ3_not_found_in_genome[a];
  164. unique_patterns_checked_in_all_lines.push_back(string_checked);
  165. for (int a = 0; a < 50; a++)
  166. {
  167. if (check_sequence(string_checked, a) == false)
  168. {
  169. break;
  170. }else{
  171. number_of_unique_patterns_checked_in_all_lines[string_checked] += 1;
  172. }
  173. }
  174. }
  175. //end = get_time::now();
  176. //diff = end - start;
  177. //cout << endl << "Checking 50 lines: " << chrono::duration_cast<ns>(diff).count() << " ms " << endl;
  178.  
  179. //filter only pattern in all 50 lines
  180. for (int a = 0; a < unique_patterns_checked_in_all_lines.size(); a++)
  181. {
  182. if (number_of_unique_patterns_checked_in_all_lines[unique_patterns_checked_in_all_lines[a]] == 50 && a%3 == 0)
  183. {
  184. //cout << input_unique_patterns[a] << ": " << number_of_certain_pattern[input_unique_patterns[a]] << " kali." << endl;
  185. unique_patters_already_in_all_lines.push_back(unique_patterns_checked_in_all_lines[a]);
  186. }
  187. }
  188. unique_patters_already_in_all_lines.push_back("CGCTGCCATACTGCA");
  189.  
  190. //end = get_time::now();
  191. //diff = end - start;
  192. //cout << endl << "Select 50 lines only: " << chrono::duration_cast<ns>(diff).count() << " ms " << endl;
  193.  
  194. //cout << "!!!!!!!!!!!! -> already in all lines: " << unique_patters_already_in_all_lines.size() << endl;
  195.  
  196.  
  197. //print patterns already in all lines
  198. //cout << "unique_patters_already_in_all_lines: " << unique_patters_already_in_all_lines.size() << endl;
  199. //cout << "reduksi: " << patterns_uniquesQ3_not_found_in_genome.size() - unique_patters_already_in_all_lines.size() << endl;
  200. float temp1 = 0, temp2 = 0, temp_size = 0;
  201.  
  202. //judging back to the file Q
  203. for (int a = 0; a < unique_patters_already_in_all_lines.size(); a++)//constructing 3D vector to save patterns parameters
  204. {
  205. z_axis_of_final_candidate_patterns[unique_patters_already_in_all_lines[a]] = a;//hashtable for z axis index from unique pattern
  206. final_patterns.push_back(row_key_pattern);//build z first (need row), then row and colomn
  207. for (int b = 0; b < sequences.size(); b++)
  208. {
  209. final_patterns.at(a).push_back(colomn_no_appearance);//build row (need colomn), then build colomn
  210. }
  211. }
  212. for (int a = 0; a < sequences.size(); a++) //check 50 line sequences
  213. {
  214. for (int b = 0; b < (sequences[a].length() - pattern_length); b++) //check per line
  215. {
  216. string string_checked = sequences[a].substr(b, 15);
  217. for (int c = 0; c < unique_patters_already_in_all_lines.size(); c++) //check for all candidate patterns
  218. {
  219. //if (c%3 == 0)
  220. //{
  221. if (HammingDistance(&string_checked[0u], 15, &unique_patters_already_in_all_lines[c][0u], tol_print) <= tol_print)
  222. {
  223. final_patterns.at(z_axis_of_final_candidate_patterns[unique_patters_already_in_all_lines[c]]).at(a).push_back(b);
  224. number_of_unique_patters_already_in_all_lines_in_Q[unique_patters_already_in_all_lines[c]] += 1;
  225. if (number_of_unique_patters_already_in_all_lines_in_Q[unique_patters_already_in_all_lines[c]] > 1)//start to calculate from 2nd appearance
  226. {
  227. //if still same line
  228. temp_size = final_patterns.at(z_axis_of_final_candidate_patterns[unique_patters_already_in_all_lines[c]]).at(a).size();
  229. if (a == temp_cek_satu && temp_size > 1)
  230. {
  231. temp1 = (final_patterns.at(z_axis_of_final_candidate_patterns[unique_patters_already_in_all_lines[c]]).at(a).at(temp_size - 1) - final_patterns.at(z_axis_of_final_candidate_patterns[unique_patters_already_in_all_lines[c]]).at(a).at(temp_size - 2));
  232. sum_of_xi[unique_patters_already_in_all_lines[c]] += temp1;
  233. sum_of_xi_square[unique_patters_already_in_all_lines[c]] += temp1 * temp1;
  234. }
  235. bool hmm = a != temp_cek[unique_patters_already_in_all_lines[c]];
  236. if (a != temp_cek[unique_patters_already_in_all_lines[c]] && final_patterns.at(z_axis_of_final_candidate_patterns[unique_patters_already_in_all_lines[c]]).at(a - 1).size() > 0)
  237. {
  238. temp2 = (999 - final_patterns.at(z_axis_of_final_candidate_patterns[unique_patters_already_in_all_lines[c]]).at(a - 1).at(final_patterns.at(z_axis_of_final_candidate_patterns[unique_patters_already_in_all_lines[c]]).at(a - 1).size() - 1) + final_patterns.at(z_axis_of_final_candidate_patterns[unique_patters_already_in_all_lines[c]]).at(a).at(temp_size - 1));
  239. sum_of_xi[unique_patters_already_in_all_lines[c]] += temp2;
  240. sum_of_xi_square[unique_patters_already_in_all_lines[c]] += temp2 * temp2;
  241. }
  242. }//end to calculate from 2nd appearance
  243. temp_cek_satu = a;
  244. temp_cek[unique_patters_already_in_all_lines[c]] = a;
  245. }
  246. //}
  247. }
  248. }
  249. }
  250. //end = get_time::now();
  251. //diff = end - start;
  252. //cout << endl << "judging back in Q3: " << chrono::duration_cast<ns>(diff).count() << " ms " << endl;
  253.  
  254. for (int a = 0; a < unique_patters_already_in_all_lines.size(); a++)
  255. {
  256. mean[unique_patters_already_in_all_lines[a]] = sum_of_xi[unique_patters_already_in_all_lines[a]] / (number_of_unique_patters_already_in_all_lines_in_Q[unique_patters_already_in_all_lines[a]] - 1);
  257. variance_candidate_patterns[unique_patters_already_in_all_lines[a]] = ((sum_of_xi_square[unique_patters_already_in_all_lines[a]]) - (2 * mean[unique_patters_already_in_all_lines[a]] * sum_of_xi[unique_patters_already_in_all_lines[a]]) + (mean[unique_patters_already_in_all_lines[a]] * mean[unique_patters_already_in_all_lines[a]] * (number_of_unique_patters_already_in_all_lines_in_Q[unique_patters_already_in_all_lines[a]] - 1))) / (number_of_unique_patters_already_in_all_lines_in_Q[unique_patters_already_in_all_lines[a]] - 1);
  258. }
  259. //deciding significant pattern using defined error function
  260. //error function -> e(x) = w1.number_of_appearances_in_genome(x) + w2.(number_of_appearance_in_Q-50)
  261. //cout << "scoring function!" << endl;
  262. for (int a = 0; a < unique_patters_already_in_all_lines.size(); a++)
  263. {
  264. error_function[unique_patters_already_in_all_lines[a]] = 2 * number_appearance_candidate_patterns_in_genome[unique_patters_already_in_all_lines[a]] + 5 * number_of_unique_patters_already_in_all_lines_in_Q[unique_patters_already_in_all_lines[a]] + 3 * sqrt(variance_candidate_patterns[unique_patters_already_in_all_lines[a]]);
  265. //cout << "error: " << unique_patters_already_in_all_lines[a] << " -> " << error_function[unique_patters_already_in_all_lines[a]];
  266. //cout << ", muncul di genome: " << number_appearance_candidate_patterns_in_genome[unique_patters_already_in_all_lines[a]];
  267. //cout << ", mucul di Q2: " << number_of_unique_patters_already_in_all_lines_in_Q[unique_patters_already_in_all_lines[a]];
  268. //cout << ", std: " << sqrt(variance_candidate_patterns[unique_patters_already_in_all_lines[a]]) << endl;
  269. if (error_function[unique_patters_already_in_all_lines[a]] < minimal_error_funcion)
  270. {
  271. minimal_error_funcion = error_function[unique_patters_already_in_all_lines[a]];
  272. significant_pattern = unique_patters_already_in_all_lines[a];
  273. }
  274. }
  275. //end = get_time::now();
  276. //diff = end - start;
  277. //cout << endl << "Needed running time: " << chrono::duration_cast<ns>(diff).count() << " ms " << endl;
  278.  
  279. //printing final patterns after decision making
  280. cout << "Significant Pattern : " << significant_pattern << endl;
  281. for (int b = 0; b < sequences.size(); b++)//total of lines/sequences
  282. {
  283. cout << "S" << b + 1 << ": " << endl;
  284. for (int c = 0; c < final_patterns.at(z_axis_of_final_candidate_patterns[significant_pattern]).at(b).size(); c++)
  285. {
  286. cout << " {(" << sequences[b].substr(final_patterns.at(z_axis_of_final_candidate_patterns[significant_pattern]).at(b).at(c), 15) << ", " << final_patterns.at(z_axis_of_final_candidate_patterns[significant_pattern]).at(b).at(c) + 1 << ")}" << endl;
  287. }
  288. //cout << endl;
  289. }
  290.  
  291. //cout << "final pattern's number of appearance" << endl;
  292. //for (int a = 0; a < number_of_unique_patters_already_in_all_lines_in_Q.size(); a++)
  293. //{
  294. // cout << unique_patters_already_in_all_lines[a] << ": " << number_of_unique_patters_already_in_all_lines_in_Q[unique_patters_already_in_all_lines[a]] << endl;
  295. //}
  296.  
  297.  
  298. //cout << "stop" << endl;
  299. //time calculation
  300. end = get_time::now();
  301. diff = end - start;
  302. cout << endl << "Needed running time: " << chrono::duration_cast<ns>(diff).count() << " ms " << endl;
  303. cout << endl << "*** Finish ***" << endl;
  304. getchar();
  305. return 0;
  306. }
  307.  
  308. /********all below scipts are functions*******/
  309.  
  310. //fuction to judge a pattern is degenerate or not
  311. bool check_deg_pattern(string ref_string, string com_string)
  312. {
  313. dif = 0;
  314. for (int a = 0; a < pattern_length; a++)
  315. {
  316. if (ref_string.substr(a, 1) != com_string.substr(a, 1))
  317. {
  318. dif += 1;
  319. if (dif == 6)
  320. {
  321. break;
  322. }
  323. }
  324. }
  325. if (dif == 6)
  326. {
  327. return false;
  328. }
  329. else {
  330. return true;
  331. }
  332. }
  333.  
  334. //fuction to calculate difference between two strings
  335. int check_diff_string(string ref_string, string com_string)
  336. {
  337. differ = 0;
  338. for (int a = 0; a < pattern_length; a++)
  339. {
  340. if (ref_string.substr(a, 1) != com_string.substr(a, 1))
  341. {
  342. differ += 1;
  343. }
  344. }
  345. return differ;
  346. }
  347.  
  348. //optimized hamming distance
  349. unsigned int HammingDistance(const char* const a, const unsigned int na, const char* const b, const unsigned int dist) {
  350.  
  351. unsigned int i = 0, num_mismatches = 0;
  352.  
  353. while (i <= dist)
  354. {
  355. if (a[i] != b[i])
  356. ++num_mismatches;
  357.  
  358. ++i;
  359. }
  360.  
  361. while (num_mismatches <= dist && i < na)
  362. {
  363. if (a[i] != b[i])
  364. ++num_mismatches;
  365.  
  366. ++i;
  367. }
  368.  
  369. return num_mismatches;
  370. }
  371.  
  372. //function to read file Q
  373. void readfile(string filename)
  374. {
  375. string line;
  376. ifstream Q2(filename);
  377. if (Q2.is_open())
  378. {
  379. while (getline(Q2, line))
  380. {
  381. if (filename == "q1.data" || filename == "q2.data" || filename == "q3.data" || filename == "ex4_7_mutates.data" || filename == "ex5_7_mutates.data" || filename == "ex6_7_mutates.data")
  382. {
  383. sequences.push_back(line);
  384. }
  385. if (filename == "genome.data")
  386. {
  387. sequences_genome.push_back(line);
  388. }
  389.  
  390. }
  391.  
  392. Q2.close();
  393. }
  394. else {
  395. cout << "Unable to open file Q2.data";
  396. }
  397. }
  398.  
  399. //check the sequence contain pattern or not
  400. bool check_sequence(string checked_pattern, int line_number)
  401. {
  402. for (int a = 0; a < (sequences[line_number].length() - pattern_length); a++)
  403. {
  404. //if (check_deg_pattern(checked_pattern, sequences[line_number].substr(a, 15)) == true)
  405. //string str = "some string";
  406. //char *cstr = &str[0u];
  407. if (HammingDistance(&checked_pattern[0u], 15, &sequences[line_number].substr(a, 15)[0u], tol_first) <= tol_first)
  408. {
  409. //cout << "berapa kali ya" << endl;
  410. return true;
  411. }
  412. }
  413. return false;
  414. }
  415.  
  416.  
  417. //define node for Binary Search Tree
  418. struct BstNode {
  419. string data;
  420. BstNode* left;
  421. BstNode* right;
  422. };
  423.  
  424. // Function to create a new Node
  425. BstNode* GetNewNode(string data) {
  426. BstNode* newNode = new BstNode();
  427. newNode->data = data;
  428. newNode->left = newNode->right = NULL;
  429. return newNode;
  430. }
  431. //Function to display the tree
  432. BstNode* inorder(BstNode* root)
  433. {
  434. if (root == NULL)
  435. return 0;
  436. inorder(root->left);
  437. cout << root->data << " ";
  438. inorder(root->right);
  439. }
  440.  
  441. // To insert data in BST, returns address of root node
  442. BstNode* Insert(BstNode* root, string data) {
  443. if (root == NULL) { // empty tree
  444. root = GetNewNode(data);
  445. }
  446. // if data to be inserted is lesser, insert in left subtree.
  447. else if (data <= root->data) {
  448. root->left = Insert(root->left, data);
  449. }
  450. // else, insert in right subtree.
  451. else {
  452. root->right = Insert(root->right, data);
  453. }
  454. return root;
  455. }
  456. //To search an element in BST, returns true if element is found
  457. bool Search(BstNode* root, string data) {
  458. if (root == NULL) {
  459. return false;
  460. }
  461. else if (root->data == data) {
  462. return true;
  463. }
  464. else if (data <= root->data) {
  465. return Search(root->left, data);
  466. }
  467. else {
  468. return Search(root->right, data);
  469. }
  470. }
Advertisement
Add Comment
Please, Sign In to add comment