Guest User

Untitled

a guest
Oct 23rd, 2018
92
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 32.45 KB | None | 0 0
  1. {
  2. "cells": [
  3. {
  4. "cell_type": "markdown",
  5. "metadata": {},
  6. "source": [
  7. "# Query connectivity between two nodes"
  8. ]
  9. },
  10. {
  11. "cell_type": "code",
  12. "execution_count": 1,
  13. "metadata": {},
  14. "outputs": [],
  15. "source": [
  16. "import pathlib\n",
  17. "import zipfile\n",
  18. "import collections\n",
  19. "\n",
  20. "import numpy\n",
  21. "import pandas\n",
  22. "import tqdm\n",
  23. "import scipy.sparse\n",
  24. "\n",
  25. "from hetmech.hetmat import HetMat\n",
  26. "import hetmech.degree_group\n",
  27. "import hetmech.pipeline"
  28. ]
  29. },
  30. {
  31. "cell_type": "markdown",
  32. "metadata": {},
  33. "source": [
  34. "## Read degree-grouped permutation archive info"
  35. ]
  36. },
  37. {
  38. "cell_type": "code",
  39. "execution_count": 2,
  40. "metadata": {},
  41. "outputs": [
  42. {
  43. "data": {
  44. "text/html": [
  45. "<div>\n",
  46. "<style scoped>\n",
  47. " .dataframe tbody tr th:only-of-type {\n",
  48. " vertical-align: middle;\n",
  49. " }\n",
  50. "\n",
  51. " .dataframe tbody tr th {\n",
  52. " vertical-align: top;\n",
  53. " }\n",
  54. "\n",
  55. " .dataframe thead th {\n",
  56. " text-align: right;\n",
  57. " }\n",
  58. "</style>\n",
  59. "<table border=\"1\" class=\"dataframe\">\n",
  60. " <thead>\n",
  61. " <tr style=\"text-align: right;\">\n",
  62. " <th></th>\n",
  63. " <th>archive</th>\n",
  64. " <th>filename</th>\n",
  65. " <th>file_size</th>\n",
  66. " <th>compress_type</th>\n",
  67. " <th>compress_size</th>\n",
  68. " <th>CRC</th>\n",
  69. " </tr>\n",
  70. " <tr>\n",
  71. " <th>metapath</th>\n",
  72. " <th></th>\n",
  73. " <th></th>\n",
  74. " <th></th>\n",
  75. " <th></th>\n",
  76. " <th></th>\n",
  77. " <th></th>\n",
  78. " </tr>\n",
  79. " </thead>\n",
  80. " <tbody>\n",
  81. " <tr>\n",
  82. " <th>AdG</th>\n",
  83. " <td>degree-grouped-perms_length-1_damping-0.5-0000...</td>\n",
  84. " <td>adjusted-path-counts/dwpc-0.5/degree-grouped-p...</td>\n",
  85. " <td>29366</td>\n",
  86. " <td>store</td>\n",
  87. " <td>29366</td>\n",
  88. " <td>1169061893</td>\n",
  89. " </tr>\n",
  90. " <tr>\n",
  91. " <th>AeG</th>\n",
  92. " <td>degree-grouped-perms_length-1_damping-0.5-0000...</td>\n",
  93. " <td>adjusted-path-counts/dwpc-0.5/degree-grouped-p...</td>\n",
  94. " <td>321650</td>\n",
  95. " <td>store</td>\n",
  96. " <td>321650</td>\n",
  97. " <td>2872114663</td>\n",
  98. " </tr>\n",
  99. " </tbody>\n",
  100. "</table>\n",
  101. "</div>"
  102. ],
  103. "text/plain": [
  104. " archive \\\n",
  105. "metapath \n",
  106. "AdG degree-grouped-perms_length-1_damping-0.5-0000... \n",
  107. "AeG degree-grouped-perms_length-1_damping-0.5-0000... \n",
  108. "\n",
  109. " filename file_size \\\n",
  110. "metapath \n",
  111. "AdG adjusted-path-counts/dwpc-0.5/degree-grouped-p... 29366 \n",
  112. "AeG adjusted-path-counts/dwpc-0.5/degree-grouped-p... 321650 \n",
  113. "\n",
  114. " compress_type compress_size CRC \n",
  115. "metapath \n",
  116. "AdG store 29366 1169061893 \n",
  117. "AeG store 321650 2872114663 "
  118. ]
  119. },
  120. "execution_count": 2,
  121. "metadata": {},
  122. "output_type": "execute_result"
  123. }
  124. ],
  125. "source": [
  126. "# Read archive locations\n",
  127. "archive_dir = pathlib.Path('../bulk-pipeline/archives-92f40fe')\n",
  128. "dfs = list()\n",
  129. "for length in range(1, 4):\n",
  130. " path = archive_dir / f'degree-grouped-perms_length-{length}_damping-0.5.zip-info.tsv'\n",
  131. " dfs.append(pandas.read_table(path))\n",
  132. "dgp_info_df = pandas.concat(dfs)\n",
  133. "dgp_info_df['metapath'] = dgp_info_df.filename.map(lambda x: x.rsplit('/', 1)[-1].split('.')[0])\n",
  134. "dgp_info_df.set_index('metapath', inplace=True)\n",
  135. "metapath_to_dgp_info = dict(dgp_info_df.iterrows())\n",
  136. "dgp_info_df.head(2)"
  137. ]
  138. },
  139. {
  140. "cell_type": "markdown",
  141. "metadata": {},
  142. "source": [
  143. "## Define functions that will be moved upstream to the hetmech package"
  144. ]
  145. },
  146. {
  147. "cell_type": "code",
  148. "execution_count": 3,
  149. "metadata": {},
  150. "outputs": [],
  151. "source": [
  152. "def dwpc_to_degrees(graph, metapath, damping=0.5, index_pairs=[]):\n",
  153. " \"\"\"\n",
  154. " Yield a description of each cell in a DWPC matrix adding source and target\n",
  155. " node degree info as well as the corresponding path count.\n",
  156. " \"\"\"\n",
  157. " metapath = graph.metagraph.get_metapath(metapath)\n",
  158. " _, _, source_adj_mat = graph.metaedge_to_adjacency_matrix(metapath[0], dense_threshold=0.7)\n",
  159. " _, _, target_adj_mat = graph.metaedge_to_adjacency_matrix(metapath[-1], dense_threshold=0.7)\n",
  160. " source_degrees = source_adj_mat.sum(axis=1).flat\n",
  161. " target_degrees = target_adj_mat.sum(axis=0).flat\n",
  162. " del source_adj_mat, target_adj_mat\n",
  163. "\n",
  164. " source_path = graph.get_nodes_path(metapath.source(), file_format='tsv')\n",
  165. " source_node_df = pandas.read_table(source_path)\n",
  166. " source_node_names = list(source_node_df['name'])\n",
  167. "\n",
  168. " target_path = graph.get_nodes_path(metapath.target(), file_format='tsv')\n",
  169. " target_node_df = pandas.read_table(target_path)\n",
  170. " target_node_names = list(target_node_df['name'])\n",
  171. "\n",
  172. " row_names, col_names, dwpc_matrix = graph.read_path_counts(metapath, 'dwpc', damping)\n",
  173. " dwpc_matrix = numpy.arcsinh(dwpc_matrix / dwpc_matrix.mean())\n",
  174. " if scipy.sparse.issparse(dwpc_matrix):\n",
  175. " dwpc_matrix = dwpc_matrix.toarray()\n",
  176. "\n",
  177. " _, _, path_count = graph.read_path_counts(metapath, 'dwpc', 0.0)\n",
  178. " if scipy.sparse.issparse(path_count):\n",
  179. " path_count = path_count.toarray()\n",
  180. "\n",
  181. " for row_ind, col_ind in index_pairs:\n",
  182. " dwpc_value = dwpc_matrix[row_ind, col_ind]\n",
  183. " row = {\n",
  184. " 'source_id': row_names[row_ind],\n",
  185. " 'target_id': col_names[col_ind],\n",
  186. " 'source_name': source_node_names[row_ind],\n",
  187. " 'target_name': target_node_names[col_ind],\n",
  188. " 'source_degree': source_degrees[row_ind],\n",
  189. " 'target_degree': target_degrees[col_ind],\n",
  190. " 'path_count': path_count[row_ind, col_ind],\n",
  191. " 'dwpc': dwpc_value,\n",
  192. " }\n",
  193. " yield collections.OrderedDict(row)\n",
  194. "\n",
  195. "\n",
  196. "def combine_dwpc_dgp(graph, metapath, damping, index_pairs, max_p_value=1.0):\n",
  197. " \"\"\"\n",
  198. " Combine DWPC information with degree-grouped permutation summary metrics.\n",
  199. " Includes gamma-hurdle significance estimates.\n",
  200. " \"\"\"\n",
  201. " # stats_path = graph.get_running_degree_group_path(metapath, 'dwpc', damping, extension='.tsv.gz')\n",
  202. " # dgp_df = pandas.read_table(stats_path)\n",
  203. " try:\n",
  204. " info = metapath_to_dgp_info[str(metapath)]\n",
  205. " inverted = False\n",
  206. " except KeyError:\n",
  207. " info = metapath_to_dgp_info[str(metapath.inverse)]\n",
  208. " inverted = True\n",
  209. " path = archive_dir / info.archive\n",
  210. " with zipfile.ZipFile(path) as zip_file:\n",
  211. " with zip_file.open(info.filename) as read_file:\n",
  212. " dgp_df = pandas.read_table(read_file, compression='gzip')\n",
  213. " if inverted:\n",
  214. " dgp_df = dgp_df.rename(columns={'source_degree': 'target_degree', 'target_degree': 'source_degree'})\n",
  215. " dgp_df['mean_nz'] = dgp_df['sum'] / dgp_df['nnz']\n",
  216. " dgp_df['sd_nz'] = ((dgp_df['sum_of_squares'] - dgp_df['sum'] ** 2 / dgp_df['nnz']) / (dgp_df['nnz'] - 1)) ** 0.5\n",
  217. " dgp_df['beta'] = dgp_df['mean_nz'] / dgp_df['sd_nz'] ** 2\n",
  218. " dgp_df['alpha'] = dgp_df['mean_nz'] * dgp_df['beta']\n",
  219. " degrees_to_dgp = dgp_df.set_index(['source_degree', 'target_degree']).to_dict(orient='index')\n",
  220. " dwpc_row_generator = dwpc_to_degrees(\n",
  221. " graph, metapath, damping=damping, index_pairs=index_pairs)\n",
  222. " for row in dwpc_row_generator:\n",
  223. " degrees = row['source_degree'], row['target_degree']\n",
  224. " dgp = degrees_to_dgp[degrees]\n",
  225. " row.update(dgp)\n",
  226. " if row['path_count'] == 0:\n",
  227. " row['p_value'] = 1.0\n",
  228. " else:\n",
  229. " row['p_value'] = None if row['sum'] == 0 else (\n",
  230. " row['nnz'] / row['n'] *\n",
  231. " (1 - scipy.special.gammainc(row['alpha'], row['beta'] * row['dwpc']))\n",
  232. " )\n",
  233. " if row['p_value'] is not None and row['p_value'] > max_p_value:\n",
  234. " continue\n",
  235. " for key in ['sum', 'sum_of_squares', 'beta', 'alpha']:\n",
  236. " del row[key]\n",
  237. " yield row\n"
  238. ]
  239. },
  240. {
  241. "cell_type": "markdown",
  242. "metadata": {},
  243. "source": [
  244. "## Specify parameters"
  245. ]
  246. },
  247. {
  248. "cell_type": "code",
  249. "execution_count": 4,
  250. "metadata": {},
  251. "outputs": [],
  252. "source": [
  253. "hetmat = HetMat('../../data/hetionet-v1.0.hetmat/')"
  254. ]
  255. },
  256. {
  257. "cell_type": "code",
  258. "execution_count": 5,
  259. "metadata": {},
  260. "outputs": [],
  261. "source": [
  262. "source_node = 'Gene', 79068 # FTO Gene\n",
  263. "target_node = 'Disease', 'DOID:9970' # Obesity\n",
  264. "\n",
  265. "# set DWPC damping exponent\n",
  266. "damping = 0.5"
  267. ]
  268. },
  269. {
  270. "cell_type": "code",
  271. "execution_count": 6,
  272. "metadata": {},
  273. "outputs": [
  274. {
  275. "data": {
  276. "text/plain": [
  277. "252"
  278. ]
  279. },
  280. "execution_count": 6,
  281. "metadata": {},
  282. "output_type": "execute_result"
  283. }
  284. ],
  285. "source": [
  286. "metapaths = hetmat.metagraph.extract_metapaths(source_node[0], target_node[0], max_length=3)\n",
  287. "len(metapaths)"
  288. ]
  289. },
  290. {
  291. "cell_type": "code",
  292. "execution_count": 7,
  293. "metadata": {},
  294. "outputs": [
  295. {
  296. "data": {
  297. "text/plain": [
  298. "(12358, 136)"
  299. ]
  300. },
  301. "execution_count": 7,
  302. "metadata": {},
  303. "output_type": "execute_result"
  304. }
  305. ],
  306. "source": [
  307. "source_index = hetmat.get_node_identifiers(source_node[0]).index(source_node[1])\n",
  308. "target_index = hetmat.get_node_identifiers(target_node[0]).index(target_node[1])\n",
  309. "source_index, target_index"
  310. ]
  311. },
  312. {
  313. "cell_type": "markdown",
  314. "metadata": {},
  315. "source": [
  316. "## Compute adjusted DWPCs and p-values"
  317. ]
  318. },
  319. {
  320. "cell_type": "code",
  321. "execution_count": 8,
  322. "metadata": {},
  323. "outputs": [
  324. {
  325. "data": {
  326. "application/vnd.jupyter.widget-view+json": {
  327. "model_id": "5a05f10944ae42049304f9328979b1f6",
  328. "version_major": 2,
  329. "version_minor": 0
  330. },
  331. "text/plain": [
  332. "HBox(children=(IntProgress(value=0, max=252), HTML(value='')))"
  333. ]
  334. },
  335. "metadata": {},
  336. "output_type": "display_data"
  337. },
  338. {
  339. "name": "stdout",
  340. "output_type": "stream",
  341. "text": [
  342. "\n"
  343. ]
  344. }
  345. ],
  346. "source": [
  347. "rows = list()\n",
  348. "for metapath in tqdm.tqdm_notebook(metapaths):\n",
  349. " index_pairs = [(source_index, target_index)]\n",
  350. " for row in combine_dwpc_dgp(hetmat, metapath, damping, index_pairs=index_pairs):\n",
  351. " row['metapath'] = str(metapath)\n",
  352. " rows.append(row)\n",
  353. "metapath_df = pandas.DataFrame(rows)"
  354. ]
  355. },
  356. {
  357. "cell_type": "code",
  358. "execution_count": 9,
  359. "metadata": {},
  360. "outputs": [
  361. {
  362. "data": {
  363. "text/html": [
  364. "<div>\n",
  365. "<style scoped>\n",
  366. " .dataframe tbody tr th:only-of-type {\n",
  367. " vertical-align: middle;\n",
  368. " }\n",
  369. "\n",
  370. " .dataframe tbody tr th {\n",
  371. " vertical-align: top;\n",
  372. " }\n",
  373. "\n",
  374. " .dataframe thead th {\n",
  375. " text-align: right;\n",
  376. " }\n",
  377. "</style>\n",
  378. "<table border=\"1\" class=\"dataframe\">\n",
  379. " <thead>\n",
  380. " <tr style=\"text-align: right;\">\n",
  381. " <th></th>\n",
  382. " <th>source_id</th>\n",
  383. " <th>target_id</th>\n",
  384. " <th>source_name</th>\n",
  385. " <th>target_name</th>\n",
  386. " <th>source_degree</th>\n",
  387. " <th>target_degree</th>\n",
  388. " <th>path_count</th>\n",
  389. " <th>dwpc</th>\n",
  390. " <th>n</th>\n",
  391. " <th>nnz</th>\n",
  392. " <th>n_perms</th>\n",
  393. " <th>mean_nz</th>\n",
  394. " <th>sd_nz</th>\n",
  395. " <th>p_value</th>\n",
  396. " <th>metapath</th>\n",
  397. " </tr>\n",
  398. " </thead>\n",
  399. " <tbody>\n",
  400. " <tr>\n",
  401. " <th>0</th>\n",
  402. " <td>79068</td>\n",
  403. " <td>DOID:9970</td>\n",
  404. " <td>FTO</td>\n",
  405. " <td>obesity</td>\n",
  406. " <td>6</td>\n",
  407. " <td>373</td>\n",
  408. " <td>1</td>\n",
  409. " <td>5.267578</td>\n",
  410. " <td>10100</td>\n",
  411. " <td>1740</td>\n",
  412. " <td>100</td>\n",
  413. " <td>5.267578</td>\n",
  414. " <td>6.468376e-08</td>\n",
  415. " <td>0.086139</td>\n",
  416. " <td>GaD</td>\n",
  417. " </tr>\n",
  418. " <tr>\n",
  419. " <th>1</th>\n",
  420. " <td>79068</td>\n",
  421. " <td>DOID:9970</td>\n",
  422. " <td>FTO</td>\n",
  423. " <td>obesity</td>\n",
  424. " <td>2</td>\n",
  425. " <td>45</td>\n",
  426. " <td>0</td>\n",
  427. " <td>0.000000</td>\n",
  428. " <td>106500</td>\n",
  429. " <td>1220</td>\n",
  430. " <td>100</td>\n",
  431. " <td>7.200037</td>\n",
  432. " <td>1.727540e-07</td>\n",
  433. " <td>1.000000</td>\n",
  434. " <td>GdD</td>\n",
  435. " </tr>\n",
  436. " </tbody>\n",
  437. "</table>\n",
  438. "</div>"
  439. ],
  440. "text/plain": [
  441. " source_id target_id source_name target_name source_degree target_degree \\\n",
  442. "0 79068 DOID:9970 FTO obesity 6 373 \n",
  443. "1 79068 DOID:9970 FTO obesity 2 45 \n",
  444. "\n",
  445. " path_count dwpc n nnz n_perms mean_nz sd_nz \\\n",
  446. "0 1 5.267578 10100 1740 100 5.267578 6.468376e-08 \n",
  447. "1 0 0.000000 106500 1220 100 7.200037 1.727540e-07 \n",
  448. "\n",
  449. " p_value metapath \n",
  450. "0 0.086139 GaD \n",
  451. "1 1.000000 GdD "
  452. ]
  453. },
  454. "execution_count": 9,
  455. "metadata": {},
  456. "output_type": "execute_result"
  457. }
  458. ],
  459. "source": [
  460. "metapath_df.head(2)"
  461. ]
  462. },
  463. {
  464. "cell_type": "code",
  465. "execution_count": 10,
  466. "metadata": {},
  467. "outputs": [
  468. {
  469. "data": {
  470. "text/html": [
  471. "<div>\n",
  472. "<style scoped>\n",
  473. " .dataframe tbody tr th:only-of-type {\n",
  474. " vertical-align: middle;\n",
  475. " }\n",
  476. "\n",
  477. " .dataframe tbody tr th {\n",
  478. " vertical-align: top;\n",
  479. " }\n",
  480. "\n",
  481. " .dataframe thead th {\n",
  482. " text-align: right;\n",
  483. " }\n",
  484. "</style>\n",
  485. "<table border=\"1\" class=\"dataframe\">\n",
  486. " <thead>\n",
  487. " <tr style=\"text-align: right;\">\n",
  488. " <th></th>\n",
  489. " <th>metapath</th>\n",
  490. " <th>source_name</th>\n",
  491. " <th>target_name</th>\n",
  492. " <th>source_degree</th>\n",
  493. " <th>target_degree</th>\n",
  494. " <th>path_count</th>\n",
  495. " <th>dwpc</th>\n",
  496. " <th>mean_nz</th>\n",
  497. " <th>n</th>\n",
  498. " <th>nnz</th>\n",
  499. " <th>p_value</th>\n",
  500. " </tr>\n",
  501. " </thead>\n",
  502. " <tbody>\n",
  503. " <tr>\n",
  504. " <th>57</th>\n",
  505. " <td>GpBPpGaD</td>\n",
  506. " <td>FTO</td>\n",
  507. " <td>obesity</td>\n",
  508. " <td>32</td>\n",
  509. " <td>373</td>\n",
  510. " <td>435</td>\n",
  511. " <td>2.814122</td>\n",
  512. " <td>2.100517</td>\n",
  513. " <td>14500</td>\n",
  514. " <td>14500</td>\n",
  515. " <td>4.747076e-08</td>\n",
  516. " </tr>\n",
  517. " <tr>\n",
  518. " <th>41</th>\n",
  519. " <td>GeAeGaD</td>\n",
  520. " <td>FTO</td>\n",
  521. " <td>obesity</td>\n",
  522. " <td>28</td>\n",
  523. " <td>373</td>\n",
  524. " <td>6204</td>\n",
  525. " <td>2.002286</td>\n",
  526. " <td>1.870643</td>\n",
  527. " <td>26500</td>\n",
  528. " <td>26500</td>\n",
  529. " <td>7.739905e-08</td>\n",
  530. " </tr>\n",
  531. " <tr>\n",
  532. " <th>108</th>\n",
  533. " <td>GaDaGaD</td>\n",
  534. " <td>FTO</td>\n",
  535. " <td>obesity</td>\n",
  536. " <td>6</td>\n",
  537. " <td>373</td>\n",
  538. " <td>280</td>\n",
  539. " <td>4.283209</td>\n",
  540. " <td>3.463896</td>\n",
  541. " <td>10100</td>\n",
  542. " <td>10100</td>\n",
  543. " <td>3.328533e-07</td>\n",
  544. " </tr>\n",
  545. " <tr>\n",
  546. " <th>117</th>\n",
  547. " <td>GaDpSpD</td>\n",
  548. " <td>FTO</td>\n",
  549. " <td>obesity</td>\n",
  550. " <td>6</td>\n",
  551. " <td>17</td>\n",
  552. " <td>25</td>\n",
  553. " <td>4.434438</td>\n",
  554. " <td>2.443015</td>\n",
  555. " <td>50500</td>\n",
  556. " <td>50498</td>\n",
  557. " <td>1.351195e-04</td>\n",
  558. " </tr>\n",
  559. " <tr>\n",
  560. " <th>249</th>\n",
  561. " <td>GpPWpGaD</td>\n",
  562. " <td>FTO</td>\n",
  563. " <td>obesity</td>\n",
  564. " <td>1</td>\n",
  565. " <td>373</td>\n",
  566. " <td>2</td>\n",
  567. " <td>3.687043</td>\n",
  568. " <td>1.467271</td>\n",
  569. " <td>107700</td>\n",
  570. " <td>100783</td>\n",
  571. " <td>7.459857e-04</td>\n",
  572. " </tr>\n",
  573. " <tr>\n",
  574. " <th>12</th>\n",
  575. " <td>GaDrD</td>\n",
  576. " <td>FTO</td>\n",
  577. " <td>obesity</td>\n",
  578. " <td>6</td>\n",
  579. " <td>5</td>\n",
  580. " <td>3</td>\n",
  581. " <td>5.138905</td>\n",
  582. " <td>3.917056</td>\n",
  583. " <td>90900</td>\n",
  584. " <td>16242</td>\n",
  585. " <td>2.361471e-03</td>\n",
  586. " </tr>\n",
  587. " <tr>\n",
  588. " <th>107</th>\n",
  589. " <td>GaDrDrD</td>\n",
  590. " <td>FTO</td>\n",
  591. " <td>obesity</td>\n",
  592. " <td>6</td>\n",
  593. " <td>5</td>\n",
  594. " <td>11</td>\n",
  595. " <td>4.850720</td>\n",
  596. " <td>2.435138</td>\n",
  597. " <td>90900</td>\n",
  598. " <td>83771</td>\n",
  599. " <td>3.827697e-03</td>\n",
  600. " </tr>\n",
  601. " <tr>\n",
  602. " <th>102</th>\n",
  603. " <td>GaDlAlD</td>\n",
  604. " <td>FTO</td>\n",
  605. " <td>obesity</td>\n",
  606. " <td>6</td>\n",
  607. " <td>33</td>\n",
  608. " <td>42</td>\n",
  609. " <td>3.744022</td>\n",
  610. " <td>2.730794</td>\n",
  611. " <td>10100</td>\n",
  612. " <td>10100</td>\n",
  613. " <td>6.445393e-03</td>\n",
  614. " </tr>\n",
  615. " <tr>\n",
  616. " <th>166</th>\n",
  617. " <td>GcGiGdD</td>\n",
  618. " <td>FTO</td>\n",
  619. " <td>obesity</td>\n",
  620. " <td>6</td>\n",
  621. " <td>45</td>\n",
  622. " <td>2</td>\n",
  623. " <td>4.111242</td>\n",
  624. " <td>2.415512</td>\n",
  625. " <td>57700</td>\n",
  626. " <td>14148</td>\n",
  627. " <td>1.770697e-02</td>\n",
  628. " </tr>\n",
  629. " <tr>\n",
  630. " <th>133</th>\n",
  631. " <td>GdDpSpD</td>\n",
  632. " <td>FTO</td>\n",
  633. " <td>obesity</td>\n",
  634. " <td>2</td>\n",
  635. " <td>17</td>\n",
  636. " <td>5</td>\n",
  637. " <td>3.383199</td>\n",
  638. " <td>1.939016</td>\n",
  639. " <td>532500</td>\n",
  640. " <td>504211</td>\n",
  641. " <td>2.750328e-02</td>\n",
  642. " </tr>\n",
  643. " <tr>\n",
  644. " <th>185</th>\n",
  645. " <td>GiGuDrD</td>\n",
  646. " <td>FTO</td>\n",
  647. " <td>obesity</td>\n",
  648. " <td>2</td>\n",
  649. " <td>5</td>\n",
  650. " <td>1</td>\n",
  651. " <td>1.130016</td>\n",
  652. " <td>2.068239</td>\n",
  653. " <td>1442700</td>\n",
  654. " <td>61314</td>\n",
  655. " <td>3.331929e-02</td>\n",
  656. " </tr>\n",
  657. " <tr>\n",
  658. " <th>37</th>\n",
  659. " <td>GeAlDrD</td>\n",
  660. " <td>FTO</td>\n",
  661. " <td>obesity</td>\n",
  662. " <td>28</td>\n",
  663. " <td>5</td>\n",
  664. " <td>18</td>\n",
  665. " <td>1.161357</td>\n",
  666. " <td>0.720075</td>\n",
  667. " <td>238500</td>\n",
  668. " <td>238489</td>\n",
  669. " <td>4.016257e-02</td>\n",
  670. " </tr>\n",
  671. " <tr>\n",
  672. " <th>131</th>\n",
  673. " <td>GdDuGdD</td>\n",
  674. " <td>FTO</td>\n",
  675. " <td>obesity</td>\n",
  676. " <td>2</td>\n",
  677. " <td>45</td>\n",
  678. " <td>4</td>\n",
  679. " <td>3.838754</td>\n",
  680. " <td>2.930357</td>\n",
  681. " <td>106500</td>\n",
  682. " <td>72110</td>\n",
  683. " <td>4.128653e-02</td>\n",
  684. " </tr>\n",
  685. " </tbody>\n",
  686. "</table>\n",
  687. "</div>"
  688. ],
  689. "text/plain": [
  690. " metapath source_name target_name source_degree target_degree \\\n",
  691. "57 GpBPpGaD FTO obesity 32 373 \n",
  692. "41 GeAeGaD FTO obesity 28 373 \n",
  693. "108 GaDaGaD FTO obesity 6 373 \n",
  694. "117 GaDpSpD FTO obesity 6 17 \n",
  695. "249 GpPWpGaD FTO obesity 1 373 \n",
  696. "12 GaDrD FTO obesity 6 5 \n",
  697. "107 GaDrDrD FTO obesity 6 5 \n",
  698. "102 GaDlAlD FTO obesity 6 33 \n",
  699. "166 GcGiGdD FTO obesity 6 45 \n",
  700. "133 GdDpSpD FTO obesity 2 17 \n",
  701. "185 GiGuDrD FTO obesity 2 5 \n",
  702. "37 GeAlDrD FTO obesity 28 5 \n",
  703. "131 GdDuGdD FTO obesity 2 45 \n",
  704. "\n",
  705. " path_count dwpc mean_nz n nnz p_value \n",
  706. "57 435 2.814122 2.100517 14500 14500 4.747076e-08 \n",
  707. "41 6204 2.002286 1.870643 26500 26500 7.739905e-08 \n",
  708. "108 280 4.283209 3.463896 10100 10100 3.328533e-07 \n",
  709. "117 25 4.434438 2.443015 50500 50498 1.351195e-04 \n",
  710. "249 2 3.687043 1.467271 107700 100783 7.459857e-04 \n",
  711. "12 3 5.138905 3.917056 90900 16242 2.361471e-03 \n",
  712. "107 11 4.850720 2.435138 90900 83771 3.827697e-03 \n",
  713. "102 42 3.744022 2.730794 10100 10100 6.445393e-03 \n",
  714. "166 2 4.111242 2.415512 57700 14148 1.770697e-02 \n",
  715. "133 5 3.383199 1.939016 532500 504211 2.750328e-02 \n",
  716. "185 1 1.130016 2.068239 1442700 61314 3.331929e-02 \n",
  717. "37 18 1.161357 0.720075 238500 238489 4.016257e-02 \n",
  718. "131 4 3.838754 2.930357 106500 72110 4.128653e-02 "
  719. ]
  720. },
  721. "execution_count": 10,
  722. "metadata": {},
  723. "output_type": "execute_result"
  724. }
  725. ],
  726. "source": [
  727. "# Show nominally significant metapaths\n",
  728. "(\n",
  729. " metapath_df\n",
  730. " .sort_values('p_value')\n",
  731. " .query(\"p_value < 0.05\")\n",
  732. " [['metapath', 'source_name', 'target_name', 'source_degree', 'target_degree', 'path_count', 'dwpc', 'mean_nz', 'n', 'nnz', 'p_value']]\n",
  733. ")"
  734. ]
  735. },
  736. {
  737. "cell_type": "code",
  738. "execution_count": 11,
  739. "metadata": {},
  740. "outputs": [
  741. {
  742. "data": {
  743. "text/html": [
  744. "<div>\n",
  745. "<style scoped>\n",
  746. " .dataframe tbody tr th:only-of-type {\n",
  747. " vertical-align: middle;\n",
  748. " }\n",
  749. "\n",
  750. " .dataframe tbody tr th {\n",
  751. " vertical-align: top;\n",
  752. " }\n",
  753. "\n",
  754. " .dataframe thead th {\n",
  755. " text-align: right;\n",
  756. " }\n",
  757. "</style>\n",
  758. "<table border=\"1\" class=\"dataframe\">\n",
  759. " <thead>\n",
  760. " <tr style=\"text-align: right;\">\n",
  761. " <th></th>\n",
  762. " <th>metapath</th>\n",
  763. " <th>source_name</th>\n",
  764. " <th>target_name</th>\n",
  765. " <th>source_degree</th>\n",
  766. " <th>target_degree</th>\n",
  767. " <th>path_count</th>\n",
  768. " <th>dwpc</th>\n",
  769. " <th>mean_nz</th>\n",
  770. " <th>n</th>\n",
  771. " <th>nnz</th>\n",
  772. " <th>p_value</th>\n",
  773. " </tr>\n",
  774. " </thead>\n",
  775. " <tbody>\n",
  776. " <tr>\n",
  777. " <th>91</th>\n",
  778. " <td>GuCpDrD</td>\n",
  779. " <td>FTO</td>\n",
  780. " <td>obesity</td>\n",
  781. " <td>0</td>\n",
  782. " <td>5</td>\n",
  783. " <td>0</td>\n",
  784. " <td>0.0</td>\n",
  785. " <td>NaN</td>\n",
  786. " <td>15928200</td>\n",
  787. " <td>0</td>\n",
  788. " <td>1.0</td>\n",
  789. " </tr>\n",
  790. " <tr>\n",
  791. " <th>92</th>\n",
  792. " <td>GuCtDrD</td>\n",
  793. " <td>FTO</td>\n",
  794. " <td>obesity</td>\n",
  795. " <td>0</td>\n",
  796. " <td>5</td>\n",
  797. " <td>0</td>\n",
  798. " <td>0.0</td>\n",
  799. " <td>NaN</td>\n",
  800. " <td>15928200</td>\n",
  801. " <td>0</td>\n",
  802. " <td>1.0</td>\n",
  803. " </tr>\n",
  804. " <tr>\n",
  805. " <th>93</th>\n",
  806. " <td>GuCbGaD</td>\n",
  807. " <td>FTO</td>\n",
  808. " <td>obesity</td>\n",
  809. " <td>0</td>\n",
  810. " <td>373</td>\n",
  811. " <td>0</td>\n",
  812. " <td>0.0</td>\n",
  813. " <td>NaN</td>\n",
  814. " <td>1769800</td>\n",
  815. " <td>0</td>\n",
  816. " <td>1.0</td>\n",
  817. " </tr>\n",
  818. " <tr>\n",
  819. " <th>95</th>\n",
  820. " <td>GuCbGuD</td>\n",
  821. " <td>FTO</td>\n",
  822. " <td>obesity</td>\n",
  823. " <td>0</td>\n",
  824. " <td>74</td>\n",
  825. " <td>0</td>\n",
  826. " <td>0.0</td>\n",
  827. " <td>NaN</td>\n",
  828. " <td>1769800</td>\n",
  829. " <td>0</td>\n",
  830. " <td>1.0</td>\n",
  831. " </tr>\n",
  832. " <tr>\n",
  833. " <th>251</th>\n",
  834. " <td>GpPWpGuD</td>\n",
  835. " <td>FTO</td>\n",
  836. " <td>obesity</td>\n",
  837. " <td>1</td>\n",
  838. " <td>74</td>\n",
  839. " <td>0</td>\n",
  840. " <td>0.0</td>\n",
  841. " <td>1.658241</td>\n",
  842. " <td>107700</td>\n",
  843. " <td>56287</td>\n",
  844. " <td>1.0</td>\n",
  845. " </tr>\n",
  846. " </tbody>\n",
  847. "</table>\n",
  848. "</div>"
  849. ],
  850. "text/plain": [
  851. " metapath source_name target_name source_degree target_degree \\\n",
  852. "91 GuCpDrD FTO obesity 0 5 \n",
  853. "92 GuCtDrD FTO obesity 0 5 \n",
  854. "93 GuCbGaD FTO obesity 0 373 \n",
  855. "95 GuCbGuD FTO obesity 0 74 \n",
  856. "251 GpPWpGuD FTO obesity 1 74 \n",
  857. "\n",
  858. " path_count dwpc mean_nz n nnz p_value \n",
  859. "91 0 0.0 NaN 15928200 0 1.0 \n",
  860. "92 0 0.0 NaN 15928200 0 1.0 \n",
  861. "93 0 0.0 NaN 1769800 0 1.0 \n",
  862. "95 0 0.0 NaN 1769800 0 1.0 \n",
  863. "251 0 0.0 1.658241 107700 56287 1.0 "
  864. ]
  865. },
  866. "execution_count": 11,
  867. "metadata": {},
  868. "output_type": "execute_result"
  869. }
  870. ],
  871. "source": [
  872. "# Non-significant metapaths\n",
  873. "(\n",
  874. " metapath_df\n",
  875. " .sort_values('p_value')\n",
  876. " .tail()\n",
  877. " [['metapath', 'source_name', 'target_name', 'source_degree', 'target_degree', 'path_count', 'dwpc', 'mean_nz', 'n', 'nnz', 'p_value']]\n",
  878. ")"
  879. ]
  880. },
  881. {
  882. "cell_type": "markdown",
  883. "metadata": {},
  884. "source": [
  885. "## Hetionet Neo4j Queries"
  886. ]
  887. },
  888. {
  889. "cell_type": "markdown",
  890. "metadata": {},
  891. "source": [
  892. "To create DWPC queries for other metapaths, you can use the following method:\n",
  893. "\n",
  894. "\n",
  895. "```python\n",
  896. "import hetio.neo4j\n",
  897. "metapath = hetmat.metagraph.get_metapath('GaDpSpD')\n",
  898. "query = hetio.neo4j.construct_dwpc_query(metapath)\n",
  899. "print(query)\n",
  900. "```\n",
  901. "\n",
  902. "\n",
  903. "## Top _GpBPpGaD_ paths\n",
  904. "\n",
  905. "```cypher\n",
  906. "MATCH path = (n0:Gene)-[:PARTICIPATES_GpBP]-(n1)-[:PARTICIPATES_GpBP]-(n2)-[:ASSOCIATES_DaG]-(n3:Disease)\n",
  907. "WHERE n0.name = 'FTO'\n",
  908. " AND n3.name = 'obesity'\n",
  909. "AND n0 <> n2\n",
  910. "WITH [\n",
  911. " size((n0)-[:PARTICIPATES_GpBP]-()),\n",
  912. " size(()-[:PARTICIPATES_GpBP]-(n1)),\n",
  913. " size((n1)-[:PARTICIPATES_GpBP]-()),\n",
  914. " size(()-[:PARTICIPATES_GpBP]-(n2)),\n",
  915. " size((n2)-[:ASSOCIATES_DaG]-()),\n",
  916. " size(()-[:ASSOCIATES_DaG]-(n3))\n",
  917. "] AS degrees, path\n",
  918. "RETURN\n",
  919. " path,\n",
  920. " substring(reduce(string = '', node IN nodes(path) | string + '—' + node.name), 1) AS nodes,\n",
  921. " reduce(pdp = 1.0, d in degrees | pdp * d ^ -0.5) AS pdp\n",
  922. "ORDER BY pdp DESC\n",
  923. "LIMIT 10\n",
  924. "```\n",
  925. "\n",
  926. "## Top _GeAeGaD_ paths\n",
  927. "\n",
  928. "```cypher\n",
  929. "MATCH path = (n0:Gene)-[:EXPRESSES_AeG]-(n1)-[:EXPRESSES_AeG]-(n2)-[:ASSOCIATES_DaG]-(n3:Disease)\n",
  930. "WHERE n0.name = 'FTO'\n",
  931. " AND n3.name = 'obesity'\n",
  932. "AND n0 <> n2\n",
  933. "WITH [\n",
  934. "size((n0)-[:EXPRESSES_AeG]-()),\n",
  935. "size(()-[:EXPRESSES_AeG]-(n1)),\n",
  936. "size((n1)-[:EXPRESSES_AeG]-()),\n",
  937. "size(()-[:EXPRESSES_AeG]-(n2)),\n",
  938. "size((n2)-[:ASSOCIATES_DaG]-()),\n",
  939. "size(()-[:ASSOCIATES_DaG]-(n3))\n",
  940. "] AS degrees, path\n",
  941. "RETURN\n",
  942. " path,\n",
  943. " substring(reduce(string = '', node IN nodes(path) | string + '—' + node.name), 1) AS nodes,\n",
  944. " reduce(pdp = 1.0, d in degrees | pdp * d ^ -0.5) AS pdp\n",
  945. "ORDER BY pdp DESC\n",
  946. "LIMIT 10\n",
  947. "```\n",
  948. "\n",
  949. "\n",
  950. "## Top _GaDpSpD_ paths\n",
  951. "\n",
  952. "```cypher\n",
  953. "MATCH path = (n0:Gene)-[:ASSOCIATES_DaG]-(n1)-[:PRESENTS_DpS]-(n2)-[:PRESENTS_DpS]-(n3:Disease)\n",
  954. "WHERE n0.name = 'FTO'\n",
  955. " AND n3.name = 'obesity'\n",
  956. "AND n1 <> n3\n",
  957. "WITH [\n",
  958. "size((n0)-[:ASSOCIATES_DaG]-()),\n",
  959. "size(()-[:ASSOCIATES_DaG]-(n1)),\n",
  960. "size((n1)-[:PRESENTS_DpS]-()),\n",
  961. "size(()-[:PRESENTS_DpS]-(n2)),\n",
  962. "size((n2)-[:PRESENTS_DpS]-()),\n",
  963. "size(()-[:PRESENTS_DpS]-(n3))\n",
  964. "] AS degrees, path\n",
  965. "RETURN\n",
  966. " path,\n",
  967. " substring(reduce(string = '', node IN nodes(path) | string + '—' + node.name), 1) AS nodes,\n",
  968. " reduce(pdp = 1.0, d in degrees | pdp * d ^ -0.5) AS pdp\n",
  969. "ORDER BY pdp DESC\n",
  970. "LIMIT 10\n",
  971. "```"
  972. ]
  973. }
  974. ],
  975. "metadata": {
  976. "kernelspec": {
  977. "display_name": "Python [conda env:hetmech]",
  978. "language": "python",
  979. "name": "conda-env-hetmech-py"
  980. },
  981. "language_info": {
  982. "codemirror_mode": {
  983. "name": "ipython",
  984. "version": 3
  985. },
  986. "file_extension": ".py",
  987. "mimetype": "text/x-python",
  988. "name": "python",
  989. "nbconvert_exporter": "python",
  990. "pygments_lexer": "ipython3",
  991. "version": "3.6.6"
  992. }
  993. },
  994. "nbformat": 4,
  995. "nbformat_minor": 2
  996. }
Add Comment
Please, Sign In to add comment