Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- {
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Query connectivity between two nodes"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [],
- "source": [
- "import collections\n",
- "import pathlib\n",
- "import zipfile\n",
- "\n",
- "import numpy\n",
- "import pandas\n",
- "import scipy.sparse\n",
- "import scipy.special\n",
- "import tqdm\n",
- "\n",
- "from hetmech.hetmat import HetMat\n",
- "import hetmech.degree_group\n",
- "import hetmech.pipeline"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Read degree-grouped permutation archive info"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<style scoped>\n",
- " .dataframe tbody tr th:only-of-type {\n",
- " vertical-align: middle;\n",
- " }\n",
- "\n",
- " .dataframe tbody tr th {\n",
- " vertical-align: top;\n",
- " }\n",
- "\n",
- " .dataframe thead th {\n",
- " text-align: right;\n",
- " }\n",
- "</style>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>archive</th>\n",
- " <th>filename</th>\n",
- " <th>file_size</th>\n",
- " <th>compress_type</th>\n",
- " <th>compress_size</th>\n",
- " <th>CRC</th>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>metapath</th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>AdG</th>\n",
- " <td>degree-grouped-perms_length-1_damping-0.5-0000...</td>\n",
- " <td>adjusted-path-counts/dwpc-0.5/degree-grouped-p...</td>\n",
- " <td>29366</td>\n",
- " <td>store</td>\n",
- " <td>29366</td>\n",
- " <td>1169061893</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>AeG</th>\n",
- " <td>degree-grouped-perms_length-1_damping-0.5-0000...</td>\n",
- " <td>adjusted-path-counts/dwpc-0.5/degree-grouped-p...</td>\n",
- " <td>321650</td>\n",
- " <td>store</td>\n",
- " <td>321650</td>\n",
- " <td>2872114663</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "</div>"
- ],
- "text/plain": [
- " archive \\\n",
- "metapath \n",
- "AdG degree-grouped-perms_length-1_damping-0.5-0000... \n",
- "AeG degree-grouped-perms_length-1_damping-0.5-0000... \n",
- "\n",
- " filename file_size \\\n",
- "metapath \n",
- "AdG adjusted-path-counts/dwpc-0.5/degree-grouped-p... 29366 \n",
- "AeG adjusted-path-counts/dwpc-0.5/degree-grouped-p... 321650 \n",
- "\n",
- " compress_type compress_size CRC \n",
- "metapath \n",
- "AdG store 29366 1169061893 \n",
- "AeG store 321650 2872114663 "
- ]
- },
- "execution_count": 2,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Locate the zip-info tables (one per metapath length) and index them by metapath\n",
- "archive_dir = pathlib.Path('../bulk-pipeline/archives-92f40fe')\n",
- "info_paths = [\n",
- "    archive_dir / f'degree-grouped-perms_length-{length}_damping-0.5.zip-info.tsv'\n",
- "    for length in range(1, 4)\n",
- "]\n",
- "dgp_info_df = pandas.concat([pandas.read_table(info_path) for info_path in info_paths])\n",
- "# The metapath abbreviation is the archived file's base name up to its first dot\n",
- "dgp_info_df = dgp_info_df.assign(\n",
- "    metapath=dgp_info_df.filename.map(lambda name: name.rsplit('/', 1)[-1].split('.')[0])\n",
- ").set_index('metapath')\n",
- "metapath_to_dgp_info = {abbrev: info for abbrev, info in dgp_info_df.iterrows()}\n",
- "dgp_info_df.head(2)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Define functions that will be moved upstream to the hetmech package"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [],
- "source": [
- "def dwpc_to_degrees(graph, metapath, damping=0.5, index_pairs=()):\n",
- "    \"\"\"\n",
- "    Yield a description of selected cells in a DWPC matrix, adding source and\n",
- "    target node degree info as well as the corresponding path count.\n",
- "\n",
- "    Parameters\n",
- "    ----------\n",
- "    graph : HetMat-like object\n",
- "        Must provide `metagraph.get_metapath`, `metaedge_to_adjacency_matrix`,\n",
- "        `get_nodes_path` and `read_path_counts`.\n",
- "    metapath : metapath object or abbreviation\n",
- "        Resolved through `graph.metagraph.get_metapath`.\n",
- "    damping : float\n",
- "        Damping exponent of the DWPC matrix to read.\n",
- "    index_pairs : iterable of (row_index, column_index) pairs\n",
- "        Matrix cells to describe. Defaults to no cells.\n",
- "\n",
- "    Yields\n",
- "    ------\n",
- "    collections.OrderedDict\n",
- "        One mapping per index pair with the keys source_id, target_id,\n",
- "        source_name, target_name, source_degree, target_degree, path_count\n",
- "        and dwpc (the arcsinh-transformed, mean-scaled DWPC value).\n",
- "    \"\"\"\n",
- "    metapath = graph.metagraph.get_metapath(metapath)\n",
- "    # Source degrees are row sums of the first metaedge's adjacency matrix;\n",
- "    # target degrees are column sums of the last metaedge's adjacency matrix.\n",
- "    _, _, source_adj_mat = graph.metaedge_to_adjacency_matrix(metapath[0], dense_threshold=0.7)\n",
- "    _, _, target_adj_mat = graph.metaedge_to_adjacency_matrix(metapath[-1], dense_threshold=0.7)\n",
- "    source_degrees = source_adj_mat.sum(axis=1).flat\n",
- "    target_degrees = target_adj_mat.sum(axis=0).flat\n",
- "    del source_adj_mat, target_adj_mat\n",
- "\n",
- "    source_path = graph.get_nodes_path(metapath.source(), file_format='tsv')\n",
- "    source_node_df = pandas.read_table(source_path)\n",
- "    source_node_names = list(source_node_df['name'])\n",
- "\n",
- "    target_path = graph.get_nodes_path(metapath.target(), file_format='tsv')\n",
- "    target_node_df = pandas.read_table(target_path)\n",
- "    target_node_names = list(target_node_df['name'])\n",
- "\n",
- "    row_names, col_names, dwpc_matrix = graph.read_path_counts(metapath, 'dwpc', damping)\n",
- "    # Scale by the matrix mean before the arcsinh transform. An all-zero\n",
- "    # matrix has mean 0; skip the scaling there so cells stay 0 instead of NaN.\n",
- "    dwpc_mean = dwpc_matrix.mean()\n",
- "    if dwpc_mean != 0:\n",
- "        dwpc_matrix = dwpc_matrix / dwpc_mean\n",
- "    dwpc_matrix = numpy.arcsinh(dwpc_matrix)\n",
- "    if scipy.sparse.issparse(dwpc_matrix):\n",
- "        dwpc_matrix = dwpc_matrix.toarray()\n",
- "\n",
- "    # The path count is read as the DWPC matrix with damping 0.0\n",
- "    _, _, path_count = graph.read_path_counts(metapath, 'dwpc', 0.0)\n",
- "    if scipy.sparse.issparse(path_count):\n",
- "        path_count = path_count.toarray()\n",
- "\n",
- "    for row_ind, col_ind in index_pairs:\n",
- "        row = {\n",
- "            'source_id': row_names[row_ind],\n",
- "            'target_id': col_names[col_ind],\n",
- "            'source_name': source_node_names[row_ind],\n",
- "            'target_name': target_node_names[col_ind],\n",
- "            'source_degree': source_degrees[row_ind],\n",
- "            'target_degree': target_degrees[col_ind],\n",
- "            'path_count': path_count[row_ind, col_ind],\n",
- "            'dwpc': dwpc_matrix[row_ind, col_ind],\n",
- "        }\n",
- "        yield collections.OrderedDict(row)\n",
- "\n",
- "\n",
- "def combine_dwpc_dgp(graph, metapath, damping, index_pairs, max_p_value=1.0):\n",
- "    \"\"\"\n",
- "    Combine DWPC information with degree-grouped permutation summary metrics.\n",
- "    Includes gamma-hurdle significance estimates.\n",
- "\n",
- "    Relies on the notebook-level names `metapath_to_dgp_info` and `archive_dir`\n",
- "    to locate the degree-grouped permutation table inside its zip archive. When\n",
- "    the metapath itself is not a key of `metapath_to_dgp_info`, the table of its\n",
- "    inverse is used with the source_degree and target_degree columns swapped.\n",
- "\n",
- "    Parameters\n",
- "    ----------\n",
- "    graph, metapath, damping, index_pairs\n",
- "        Forwarded to `dwpc_to_degrees`; `metapath` must expose `.inverse`.\n",
- "    max_p_value : float\n",
- "        Rows whose p_value is greater than this threshold are skipped.\n",
- "\n",
- "    Yields\n",
- "    ------\n",
- "    collections.OrderedDict\n",
- "        Each row from `dwpc_to_degrees`, updated with the permutation columns\n",
- "        for its (source_degree, target_degree) pair plus mean_nz, sd_nz and\n",
- "        p_value. p_value is 1.0 when path_count is 0 and None when the\n",
- "        permutation sum is 0. The sum, sum_of_squares, beta and alpha entries\n",
- "        are removed before the row is yielded.\n",
- "\n",
- "    Raises\n",
- "    ------\n",
- "    KeyError\n",
- "        If neither the metapath nor its inverse is in `metapath_to_dgp_info`,\n",
- "        or if a row's degree pair is absent from the permutation table.\n",
- "    \"\"\"\n",
- "    try:\n",
- "        info = metapath_to_dgp_info[str(metapath)]\n",
- "        inverted = False\n",
- "    except KeyError:\n",
- "        info = metapath_to_dgp_info[str(metapath.inverse)]\n",
- "        inverted = True\n",
- "    path = archive_dir / info.archive\n",
- "    with zipfile.ZipFile(path) as zip_file:\n",
- "        with zip_file.open(info.filename) as read_file:\n",
- "            dgp_df = pandas.read_table(read_file, compression='gzip')\n",
- "    if inverted:\n",
- "        dgp_df = dgp_df.rename(columns={'source_degree': 'target_degree', 'target_degree': 'source_degree'})\n",
- "    # Mean and sample standard deviation of the nonzero permuted values,\n",
- "    # converted to the gamma parameters alpha and beta\n",
- "    dgp_df['mean_nz'] = dgp_df['sum'] / dgp_df['nnz']\n",
- "    dgp_df['sd_nz'] = ((dgp_df['sum_of_squares'] - dgp_df['sum'] ** 2 / dgp_df['nnz']) / (dgp_df['nnz'] - 1)) ** 0.5\n",
- "    dgp_df['beta'] = dgp_df['mean_nz'] / dgp_df['sd_nz'] ** 2\n",
- "    dgp_df['alpha'] = dgp_df['mean_nz'] * dgp_df['beta']\n",
- "    degrees_to_dgp = dgp_df.set_index(['source_degree', 'target_degree']).to_dict(orient='index')\n",
- "    dwpc_row_generator = dwpc_to_degrees(\n",
- "        graph, metapath, damping=damping, index_pairs=index_pairs)\n",
- "    for row in dwpc_row_generator:\n",
- "        degrees = row['source_degree'], row['target_degree']\n",
- "        dgp = degrees_to_dgp[degrees]\n",
- "        row.update(dgp)\n",
- "        if row['path_count'] == 0:\n",
- "            row['p_value'] = 1.0\n",
- "        elif row['sum'] == 0:\n",
- "            row['p_value'] = None\n",
- "        else:\n",
- "            # gammaincc is the regularized upper incomplete gamma function,\n",
- "            # i.e. 1 - gammainc, computed without the cancellation that the\n",
- "            # explicit subtraction suffers for very small tail probabilities.\n",
- "            row['p_value'] = (\n",
- "                row['nnz'] / row['n'] *\n",
- "                scipy.special.gammaincc(row['alpha'], row['beta'] * row['dwpc'])\n",
- "            )\n",
- "        if row['p_value'] is not None and row['p_value'] > max_p_value:\n",
- "            continue\n",
- "        for key in ['sum', 'sum_of_squares', 'beta', 'alpha']:\n",
- "            del row[key]\n",
- "        yield row\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Specify parameters"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [],
- "source": [
- "hetmat = HetMat('../../data/hetionet-v1.0.hetmat/')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {},
- "outputs": [],
- "source": [
- "source_node = 'Gene', 79068 # FTO Gene\n",
- "target_node = 'Disease', 'DOID:9970' # Obesity\n",
- "\n",
- "# set DWPC damping exponent\n",
- "damping = 0.5"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "252"
- ]
- },
- "execution_count": 6,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "metapaths = hetmat.metagraph.extract_metapaths(source_node[0], target_node[0], max_length=3)\n",
- "len(metapaths)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "(12358, 136)"
- ]
- },
- "execution_count": 7,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "source_index = hetmat.get_node_identifiers(source_node[0]).index(source_node[1])\n",
- "target_index = hetmat.get_node_identifiers(target_node[0]).index(target_node[1])\n",
- "source_index, target_index"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Compute adjusted DWPCs and p-values"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "5a05f10944ae42049304f9328979b1f6",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "HBox(children=(IntProgress(value=0, max=252), HTML(value='')))"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n"
- ]
- }
- ],
- "source": [
- "# The same single (source, target) cell is queried for every metapath\n",
- "index_pairs = [(source_index, target_index)]\n",
- "rows = []\n",
- "for metapath in tqdm.tqdm_notebook(metapaths):\n",
- "    metapath_rows = combine_dwpc_dgp(hetmat, metapath, damping, index_pairs=index_pairs)\n",
- "    for row in metapath_rows:\n",
- "        # Tag each row with the metapath abbreviation it was computed for\n",
- "        row['metapath'] = str(metapath)\n",
- "        rows.append(row)\n",
- "metapath_df = pandas.DataFrame(rows)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<style scoped>\n",
- " .dataframe tbody tr th:only-of-type {\n",
- " vertical-align: middle;\n",
- " }\n",
- "\n",
- " .dataframe tbody tr th {\n",
- " vertical-align: top;\n",
- " }\n",
- "\n",
- " .dataframe thead th {\n",
- " text-align: right;\n",
- " }\n",
- "</style>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>source_id</th>\n",
- " <th>target_id</th>\n",
- " <th>source_name</th>\n",
- " <th>target_name</th>\n",
- " <th>source_degree</th>\n",
- " <th>target_degree</th>\n",
- " <th>path_count</th>\n",
- " <th>dwpc</th>\n",
- " <th>n</th>\n",
- " <th>nnz</th>\n",
- " <th>n_perms</th>\n",
- " <th>mean_nz</th>\n",
- " <th>sd_nz</th>\n",
- " <th>p_value</th>\n",
- " <th>metapath</th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>0</th>\n",
- " <td>79068</td>\n",
- " <td>DOID:9970</td>\n",
- " <td>FTO</td>\n",
- " <td>obesity</td>\n",
- " <td>6</td>\n",
- " <td>373</td>\n",
- " <td>1</td>\n",
- " <td>5.267578</td>\n",
- " <td>10100</td>\n",
- " <td>1740</td>\n",
- " <td>100</td>\n",
- " <td>5.267578</td>\n",
- " <td>6.468376e-08</td>\n",
- " <td>0.086139</td>\n",
- " <td>GaD</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>1</th>\n",
- " <td>79068</td>\n",
- " <td>DOID:9970</td>\n",
- " <td>FTO</td>\n",
- " <td>obesity</td>\n",
- " <td>2</td>\n",
- " <td>45</td>\n",
- " <td>0</td>\n",
- " <td>0.000000</td>\n",
- " <td>106500</td>\n",
- " <td>1220</td>\n",
- " <td>100</td>\n",
- " <td>7.200037</td>\n",
- " <td>1.727540e-07</td>\n",
- " <td>1.000000</td>\n",
- " <td>GdD</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "</div>"
- ],
- "text/plain": [
- " source_id target_id source_name target_name source_degree target_degree \\\n",
- "0 79068 DOID:9970 FTO obesity 6 373 \n",
- "1 79068 DOID:9970 FTO obesity 2 45 \n",
- "\n",
- " path_count dwpc n nnz n_perms mean_nz sd_nz \\\n",
- "0 1 5.267578 10100 1740 100 5.267578 6.468376e-08 \n",
- "1 0 0.000000 106500 1220 100 7.200037 1.727540e-07 \n",
- "\n",
- " p_value metapath \n",
- "0 0.086139 GaD \n",
- "1 1.000000 GdD "
- ]
- },
- "execution_count": 9,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "metapath_df.head(2)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<style scoped>\n",
- " .dataframe tbody tr th:only-of-type {\n",
- " vertical-align: middle;\n",
- " }\n",
- "\n",
- " .dataframe tbody tr th {\n",
- " vertical-align: top;\n",
- " }\n",
- "\n",
- " .dataframe thead th {\n",
- " text-align: right;\n",
- " }\n",
- "</style>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>metapath</th>\n",
- " <th>source_name</th>\n",
- " <th>target_name</th>\n",
- " <th>source_degree</th>\n",
- " <th>target_degree</th>\n",
- " <th>path_count</th>\n",
- " <th>dwpc</th>\n",
- " <th>mean_nz</th>\n",
- " <th>n</th>\n",
- " <th>nnz</th>\n",
- " <th>p_value</th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>57</th>\n",
- " <td>GpBPpGaD</td>\n",
- " <td>FTO</td>\n",
- " <td>obesity</td>\n",
- " <td>32</td>\n",
- " <td>373</td>\n",
- " <td>435</td>\n",
- " <td>2.814122</td>\n",
- " <td>2.100517</td>\n",
- " <td>14500</td>\n",
- " <td>14500</td>\n",
- " <td>4.747076e-08</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>41</th>\n",
- " <td>GeAeGaD</td>\n",
- " <td>FTO</td>\n",
- " <td>obesity</td>\n",
- " <td>28</td>\n",
- " <td>373</td>\n",
- " <td>6204</td>\n",
- " <td>2.002286</td>\n",
- " <td>1.870643</td>\n",
- " <td>26500</td>\n",
- " <td>26500</td>\n",
- " <td>7.739905e-08</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>108</th>\n",
- " <td>GaDaGaD</td>\n",
- " <td>FTO</td>\n",
- " <td>obesity</td>\n",
- " <td>6</td>\n",
- " <td>373</td>\n",
- " <td>280</td>\n",
- " <td>4.283209</td>\n",
- " <td>3.463896</td>\n",
- " <td>10100</td>\n",
- " <td>10100</td>\n",
- " <td>3.328533e-07</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>117</th>\n",
- " <td>GaDpSpD</td>\n",
- " <td>FTO</td>\n",
- " <td>obesity</td>\n",
- " <td>6</td>\n",
- " <td>17</td>\n",
- " <td>25</td>\n",
- " <td>4.434438</td>\n",
- " <td>2.443015</td>\n",
- " <td>50500</td>\n",
- " <td>50498</td>\n",
- " <td>1.351195e-04</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>249</th>\n",
- " <td>GpPWpGaD</td>\n",
- " <td>FTO</td>\n",
- " <td>obesity</td>\n",
- " <td>1</td>\n",
- " <td>373</td>\n",
- " <td>2</td>\n",
- " <td>3.687043</td>\n",
- " <td>1.467271</td>\n",
- " <td>107700</td>\n",
- " <td>100783</td>\n",
- " <td>7.459857e-04</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>12</th>\n",
- " <td>GaDrD</td>\n",
- " <td>FTO</td>\n",
- " <td>obesity</td>\n",
- " <td>6</td>\n",
- " <td>5</td>\n",
- " <td>3</td>\n",
- " <td>5.138905</td>\n",
- " <td>3.917056</td>\n",
- " <td>90900</td>\n",
- " <td>16242</td>\n",
- " <td>2.361471e-03</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>107</th>\n",
- " <td>GaDrDrD</td>\n",
- " <td>FTO</td>\n",
- " <td>obesity</td>\n",
- " <td>6</td>\n",
- " <td>5</td>\n",
- " <td>11</td>\n",
- " <td>4.850720</td>\n",
- " <td>2.435138</td>\n",
- " <td>90900</td>\n",
- " <td>83771</td>\n",
- " <td>3.827697e-03</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>102</th>\n",
- " <td>GaDlAlD</td>\n",
- " <td>FTO</td>\n",
- " <td>obesity</td>\n",
- " <td>6</td>\n",
- " <td>33</td>\n",
- " <td>42</td>\n",
- " <td>3.744022</td>\n",
- " <td>2.730794</td>\n",
- " <td>10100</td>\n",
- " <td>10100</td>\n",
- " <td>6.445393e-03</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>166</th>\n",
- " <td>GcGiGdD</td>\n",
- " <td>FTO</td>\n",
- " <td>obesity</td>\n",
- " <td>6</td>\n",
- " <td>45</td>\n",
- " <td>2</td>\n",
- " <td>4.111242</td>\n",
- " <td>2.415512</td>\n",
- " <td>57700</td>\n",
- " <td>14148</td>\n",
- " <td>1.770697e-02</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>133</th>\n",
- " <td>GdDpSpD</td>\n",
- " <td>FTO</td>\n",
- " <td>obesity</td>\n",
- " <td>2</td>\n",
- " <td>17</td>\n",
- " <td>5</td>\n",
- " <td>3.383199</td>\n",
- " <td>1.939016</td>\n",
- " <td>532500</td>\n",
- " <td>504211</td>\n",
- " <td>2.750328e-02</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>185</th>\n",
- " <td>GiGuDrD</td>\n",
- " <td>FTO</td>\n",
- " <td>obesity</td>\n",
- " <td>2</td>\n",
- " <td>5</td>\n",
- " <td>1</td>\n",
- " <td>1.130016</td>\n",
- " <td>2.068239</td>\n",
- " <td>1442700</td>\n",
- " <td>61314</td>\n",
- " <td>3.331929e-02</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>37</th>\n",
- " <td>GeAlDrD</td>\n",
- " <td>FTO</td>\n",
- " <td>obesity</td>\n",
- " <td>28</td>\n",
- " <td>5</td>\n",
- " <td>18</td>\n",
- " <td>1.161357</td>\n",
- " <td>0.720075</td>\n",
- " <td>238500</td>\n",
- " <td>238489</td>\n",
- " <td>4.016257e-02</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>131</th>\n",
- " <td>GdDuGdD</td>\n",
- " <td>FTO</td>\n",
- " <td>obesity</td>\n",
- " <td>2</td>\n",
- " <td>45</td>\n",
- " <td>4</td>\n",
- " <td>3.838754</td>\n",
- " <td>2.930357</td>\n",
- " <td>106500</td>\n",
- " <td>72110</td>\n",
- " <td>4.128653e-02</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "</div>"
- ],
- "text/plain": [
- " metapath source_name target_name source_degree target_degree \\\n",
- "57 GpBPpGaD FTO obesity 32 373 \n",
- "41 GeAeGaD FTO obesity 28 373 \n",
- "108 GaDaGaD FTO obesity 6 373 \n",
- "117 GaDpSpD FTO obesity 6 17 \n",
- "249 GpPWpGaD FTO obesity 1 373 \n",
- "12 GaDrD FTO obesity 6 5 \n",
- "107 GaDrDrD FTO obesity 6 5 \n",
- "102 GaDlAlD FTO obesity 6 33 \n",
- "166 GcGiGdD FTO obesity 6 45 \n",
- "133 GdDpSpD FTO obesity 2 17 \n",
- "185 GiGuDrD FTO obesity 2 5 \n",
- "37 GeAlDrD FTO obesity 28 5 \n",
- "131 GdDuGdD FTO obesity 2 45 \n",
- "\n",
- " path_count dwpc mean_nz n nnz p_value \n",
- "57 435 2.814122 2.100517 14500 14500 4.747076e-08 \n",
- "41 6204 2.002286 1.870643 26500 26500 7.739905e-08 \n",
- "108 280 4.283209 3.463896 10100 10100 3.328533e-07 \n",
- "117 25 4.434438 2.443015 50500 50498 1.351195e-04 \n",
- "249 2 3.687043 1.467271 107700 100783 7.459857e-04 \n",
- "12 3 5.138905 3.917056 90900 16242 2.361471e-03 \n",
- "107 11 4.850720 2.435138 90900 83771 3.827697e-03 \n",
- "102 42 3.744022 2.730794 10100 10100 6.445393e-03 \n",
- "166 2 4.111242 2.415512 57700 14148 1.770697e-02 \n",
- "133 5 3.383199 1.939016 532500 504211 2.750328e-02 \n",
- "185 1 1.130016 2.068239 1442700 61314 3.331929e-02 \n",
- "37 18 1.161357 0.720075 238500 238489 4.016257e-02 \n",
- "131 4 3.838754 2.930357 106500 72110 4.128653e-02 "
- ]
- },
- "execution_count": 10,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Show nominally significant metapaths\n",
- "# Nominally significant metapaths (p_value below 0.05), smallest p_value first\n",
- "significant_columns = [\n",
- "    'metapath', 'source_name', 'target_name', 'source_degree', 'target_degree',\n",
- "    'path_count', 'dwpc', 'mean_nz', 'n', 'nnz', 'p_value',\n",
- "]\n",
- "significant_df = metapath_df.sort_values('p_value').query(\"p_value < 0.05\")\n",
- "significant_df[significant_columns]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<style scoped>\n",
- " .dataframe tbody tr th:only-of-type {\n",
- " vertical-align: middle;\n",
- " }\n",
- "\n",
- " .dataframe tbody tr th {\n",
- " vertical-align: top;\n",
- " }\n",
- "\n",
- " .dataframe thead th {\n",
- " text-align: right;\n",
- " }\n",
- "</style>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>metapath</th>\n",
- " <th>source_name</th>\n",
- " <th>target_name</th>\n",
- " <th>source_degree</th>\n",
- " <th>target_degree</th>\n",
- " <th>path_count</th>\n",
- " <th>dwpc</th>\n",
- " <th>mean_nz</th>\n",
- " <th>n</th>\n",
- " <th>nnz</th>\n",
- " <th>p_value</th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>91</th>\n",
- " <td>GuCpDrD</td>\n",
- " <td>FTO</td>\n",
- " <td>obesity</td>\n",
- " <td>0</td>\n",
- " <td>5</td>\n",
- " <td>0</td>\n",
- " <td>0.0</td>\n",
- " <td>NaN</td>\n",
- " <td>15928200</td>\n",
- " <td>0</td>\n",
- " <td>1.0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>92</th>\n",
- " <td>GuCtDrD</td>\n",
- " <td>FTO</td>\n",
- " <td>obesity</td>\n",
- " <td>0</td>\n",
- " <td>5</td>\n",
- " <td>0</td>\n",
- " <td>0.0</td>\n",
- " <td>NaN</td>\n",
- " <td>15928200</td>\n",
- " <td>0</td>\n",
- " <td>1.0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>93</th>\n",
- " <td>GuCbGaD</td>\n",
- " <td>FTO</td>\n",
- " <td>obesity</td>\n",
- " <td>0</td>\n",
- " <td>373</td>\n",
- " <td>0</td>\n",
- " <td>0.0</td>\n",
- " <td>NaN</td>\n",
- " <td>1769800</td>\n",
- " <td>0</td>\n",
- " <td>1.0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>95</th>\n",
- " <td>GuCbGuD</td>\n",
- " <td>FTO</td>\n",
- " <td>obesity</td>\n",
- " <td>0</td>\n",
- " <td>74</td>\n",
- " <td>0</td>\n",
- " <td>0.0</td>\n",
- " <td>NaN</td>\n",
- " <td>1769800</td>\n",
- " <td>0</td>\n",
- " <td>1.0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>251</th>\n",
- " <td>GpPWpGuD</td>\n",
- " <td>FTO</td>\n",
- " <td>obesity</td>\n",
- " <td>1</td>\n",
- " <td>74</td>\n",
- " <td>0</td>\n",
- " <td>0.0</td>\n",
- " <td>1.658241</td>\n",
- " <td>107700</td>\n",
- " <td>56287</td>\n",
- " <td>1.0</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "</div>"
- ],
- "text/plain": [
- " metapath source_name target_name source_degree target_degree \\\n",
- "91 GuCpDrD FTO obesity 0 5 \n",
- "92 GuCtDrD FTO obesity 0 5 \n",
- "93 GuCbGaD FTO obesity 0 373 \n",
- "95 GuCbGuD FTO obesity 0 74 \n",
- "251 GpPWpGuD FTO obesity 1 74 \n",
- "\n",
- " path_count dwpc mean_nz n nnz p_value \n",
- "91 0 0.0 NaN 15928200 0 1.0 \n",
- "92 0 0.0 NaN 15928200 0 1.0 \n",
- "93 0 0.0 NaN 1769800 0 1.0 \n",
- "95 0 0.0 NaN 1769800 0 1.0 \n",
- "251 0 0.0 1.658241 107700 56287 1.0 "
- ]
- },
- "execution_count": 11,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Non-significant metapaths\n",
- "# Metapaths at the tail of the p_value ordering (the last five rows after sorting)\n",
- "tail_columns = [\n",
- "    'metapath', 'source_name', 'target_name', 'source_degree', 'target_degree',\n",
- "    'path_count', 'dwpc', 'mean_nz', 'n', 'nnz', 'p_value',\n",
- "]\n",
- "least_significant_df = metapath_df.sort_values('p_value').tail()\n",
- "least_significant_df[tail_columns]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Hetionet Neo4j Queries"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "To create DWPC queries for other metapaths, you can use the following method:\n",
- "\n",
- "\n",
- "```python\n",
- "import hetio.neo4j\n",
- "metapath = hetmat.metagraph.get_metapath('GaDpSpD')\n",
- "query = hetio.neo4j.construct_dwpc_query(metapath)\n",
- "print(query)\n",
- "```\n",
- "\n",
- "\n",
- "## Top _GpBPpGaD_ paths\n",
- "\n",
- "```cypher\n",
- "MATCH path = (n0:Gene)-[:PARTICIPATES_GpBP]-(n1)-[:PARTICIPATES_GpBP]-(n2)-[:ASSOCIATES_DaG]-(n3:Disease)\n",
- "WHERE n0.name = 'FTO'\n",
- " AND n3.name = 'obesity'\n",
- "AND n0 <> n2\n",
- "WITH [\n",
- " size((n0)-[:PARTICIPATES_GpBP]-()),\n",
- " size(()-[:PARTICIPATES_GpBP]-(n1)),\n",
- " size((n1)-[:PARTICIPATES_GpBP]-()),\n",
- " size(()-[:PARTICIPATES_GpBP]-(n2)),\n",
- " size((n2)-[:ASSOCIATES_DaG]-()),\n",
- " size(()-[:ASSOCIATES_DaG]-(n3))\n",
- "] AS degrees, path\n",
- "RETURN\n",
- " path,\n",
- " substring(reduce(string = '', node IN nodes(path) | string + '—' + node.name), 1) AS nodes,\n",
- " reduce(pdp = 1.0, d in degrees | pdp * d ^ -0.5) AS pdp\n",
- "ORDER BY pdp DESC\n",
- "LIMIT 10\n",
- "```\n",
- "\n",
- "## Top _GeAeGaD_ paths\n",
- "\n",
- "```cypher\n",
- "MATCH path = (n0:Gene)-[:EXPRESSES_AeG]-(n1)-[:EXPRESSES_AeG]-(n2)-[:ASSOCIATES_DaG]-(n3:Disease)\n",
- "WHERE n0.name = 'FTO'\n",
- " AND n3.name = 'obesity'\n",
- "AND n0 <> n2\n",
- "WITH [\n",
- "size((n0)-[:EXPRESSES_AeG]-()),\n",
- "size(()-[:EXPRESSES_AeG]-(n1)),\n",
- "size((n1)-[:EXPRESSES_AeG]-()),\n",
- "size(()-[:EXPRESSES_AeG]-(n2)),\n",
- "size((n2)-[:ASSOCIATES_DaG]-()),\n",
- "size(()-[:ASSOCIATES_DaG]-(n3))\n",
- "] AS degrees, path\n",
- "RETURN\n",
- " path,\n",
- " substring(reduce(string = '', node IN nodes(path) | string + '—' + node.name), 1) AS nodes,\n",
- " reduce(pdp = 1.0, d in degrees | pdp * d ^ -0.5) AS pdp\n",
- "ORDER BY pdp DESC\n",
- "LIMIT 10\n",
- "```\n",
- "\n",
- "\n",
- "## Top _GaDpSpD_ paths\n",
- "\n",
- "```cypher\n",
- "MATCH path = (n0:Gene)-[:ASSOCIATES_DaG]-(n1)-[:PRESENTS_DpS]-(n2)-[:PRESENTS_DpS]-(n3:Disease)\n",
- "WHERE n0.name = 'FTO'\n",
- " AND n3.name = 'obesity'\n",
- "AND n1 <> n3\n",
- "WITH [\n",
- "size((n0)-[:ASSOCIATES_DaG]-()),\n",
- "size(()-[:ASSOCIATES_DaG]-(n1)),\n",
- "size((n1)-[:PRESENTS_DpS]-()),\n",
- "size(()-[:PRESENTS_DpS]-(n2)),\n",
- "size((n2)-[:PRESENTS_DpS]-()),\n",
- "size(()-[:PRESENTS_DpS]-(n3))\n",
- "] AS degrees, path\n",
- "RETURN\n",
- " path,\n",
- " substring(reduce(string = '', node IN nodes(path) | string + '—' + node.name), 1) AS nodes,\n",
- " reduce(pdp = 1.0, d in degrees | pdp * d ^ -0.5) AS pdp\n",
- "ORDER BY pdp DESC\n",
- "LIMIT 10\n",
- "```"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python [conda env:hetmech]",
- "language": "python",
- "name": "conda-env-hetmech-py"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.6.6"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
- }
Add Comment
Please, Sign In to add comment