Untitled

{
 "cells": [
  {
   "metadata": {
    "trusted": true
   },
   "cell_type": "code",
   "source": "import csv\nimport shlex\nfile2 = open(\"/Users/zainab/Dropbox/DREU/Code/data/conditions-master.csv\", \"r\", encoding='utf-8', errors='ignore')\nreader2 = csv.reader(file2, delimiter=',')\ncond_list = {}\nfor row in reader2:\n    cond = row[0]\n    count = row[1]\n    if len(cond) > 0:\n        cond_list[cond]=count\nfile2.close()",
   "execution_count": 3,
   "outputs": []
  },
  {
   "metadata": {
    "trusted": true
   },
   "cell_type": "code",
   "source": "len(cond_list)",
   "execution_count": 4,
   "outputs": [
    {
     "output_type": "execute_result",
     "execution_count": 4,
     "data": {
      "text/plain": "69136"
     },
     "metadata": {}
    }
   ]
  },
  {
   "metadata": {
    "trusted": true
   },
   "cell_type": "code",
   "source": "import os\npath_ann = \"/Users/zainab/Desktop/ebm_nlp/annotations/hierarchical_labels/participants/\"\npath_abs = \"/Users/zainab/Desktop/ebm_nlp/documents/\"\nabstracts = []\ngold = []\npmids = []\n\nfor filepath in os.listdir(path_ann):\n    if filepath.endswith('DAWID_SKENE.ann'):\n        pmid = filepath.strip('_DAWID_SKENE.ann')\n        pmids.append(pmid)\n        labels = open(path_ann+filepath).read().split(',')\n        for abs_filepath in os.listdir(path_abs):\n            if abs_filepath == pmid+\".text\":\n                abstract = open(path_abs+abs_filepath).read()\n                abstracts.append(abstract)\n            if abs_filepath == pmid+\".tokens\":\n                tokens = open(path_abs+abs_filepath).read().split(' ')\n                if len(labels) == len(tokens):\n                    ann = []\n                    s = \"\"\n                    for i in range(len(tokens)):\n                        if labels[i] == \"4\":\n                            s = s+tokens[i]+\" \"\n                        if labels[i] != \"4\":\n                            if len(s)>0:\n                                ann.append(s.strip())\n                                s = \"\"\n                    gold.append(ann)",
   "execution_count": 5,
   "outputs": []
  },
  {
   "metadata": {
    "trusted": true
   },
   "cell_type": "code",
   "source": "import os\npath_ann = \"/Users/zainab/Desktop/ebm_nlp/annotations/hierarchical_labels/participants_experts/\"\npath_abs = \"/Users/zainab/Desktop/ebm_nlp/documents/\"\nabstracts_2 = []\ngold_2 = []\npmids = []\n\nfor filepath in os.listdir(path_ann):\n    if filepath.endswith('DAWID_SKENE.ann'):\n        pmid = filepath.strip('_DAWID_SKENE.ann')\n        pmids.append(pmid)\n        labels = open(path_ann+filepath).read().split(',')\n        for abs_filepath in os.listdir(path_abs):\n            if abs_filepath == pmid+\".text\":\n                abstract = open(path_abs+abs_filepath).read()\n                abstracts_2.append(abstract)\n            if abs_filepath == pmid+\".tokens\":\n                tokens = open(path_abs+abs_filepath).read().split(' ')\n                if len(labels) == len(tokens):\n                    ann = []\n                    s = \"\"\n                    for i in range(len(tokens)):\n                        if labels[i] == \"4\":\n                            s = s+tokens[i]+\" \"\n                        if labels[i] != \"4\":\n                            if len(s)>0:\n                                ann.append(s.strip())\n                                s = \"\"\n                    gold_2.append(ann)",
   "execution_count": 107,
   "outputs": []
  },
  {
   "metadata": {
    "trusted": true
   },
   "cell_type": "code",
   "source": "long_predictions = []\nfor abstract in abstracts_2:\n    abstract = abstract.lower()\n    anns = []\n    for key in cond_list.keys():\n        if key in abstract:\n            anns.append(key)\n    long_predictions.append(anns)",
   "execution_count": 164,
   "outputs": []
  },
  {
   "metadata": {
    "trusted": true
   },
   "cell_type": "code",
   "source": "print(\"Abstract: \", abstracts[8])\nprint(\"What is annotated: \", gold[8])\nprint(\"What we found: \", long_predictions[8])",
   "execution_count": null,
   "outputs": []
  },
  {
   "metadata": {
    "trusted": true
   },
   "cell_type": "code",
   "source": "import itertools\n# collapse gold list \ngold_merged_2 = list(itertools.chain.from_iterable(gold_2))",
   "execution_count": 165,
   "outputs": []
  },
  {
   "metadata": {
    "trusted": true
   },
   "cell_type": "code",
   "source": "# full match\ncounter = 0\nfor term in gold_merged_2:\n    if term in cond_list.keys():\n        counter+= 1",
   "execution_count": 169,
   "outputs": []
  },
  {
   "metadata": {
    "trusted": true
   },
   "cell_type": "code",
   "source": "counter/len(gold_merged_2)",
   "execution_count": 170,
   "outputs": [
    {
     "output_type": "execute_result",
     "execution_count": 170,
     "data": {
      "text/plain": "0.4726277372262774"
     },
     "metadata": {}
    }
   ]
  },
  {
   "metadata": {
    "trusted": true
   },
   "cell_type": "code",
   "source": "# partial match\ncounter_partial = 0\nfor term in gold_merged_2:\n    for key in cond_list.keys():\n        if term in key:\n            counter_partial+= 1\n            break;",
   "execution_count": 172,
   "outputs": []
  },
  {
   "metadata": {
    "trusted": true
   },
   "cell_type": "code",
   "source": "counter_partial/len(gold_merged_2)",
   "execution_count": 173,
   "outputs": [
    {
     "output_type": "execute_result",
     "execution_count": 173,
     "data": {
      "text/plain": "0.5291970802919708"
     },
     "metadata": {}
    }
   ]
  },
  {
   "metadata": {
    "trusted": true
   },
   "cell_type": "code",
   "source": "#at paper level full match\natleast_one = 0\nfor condition in gold_2:\n    boo = False\n    for cond in condition:\n        if cond in cond_list.keys():\n            boo = True\n    if boo == True:\n        atleast_one+=1",
   "execution_count": 174,
   "outputs": []
  },
  {
   "metadata": {
    "trusted": true
   },
   "cell_type": "code",
   "source": "atleast_one/len(gold_2)",
   "execution_count": 176,
   "outputs": [
    {
     "output_type": "execute_result",
     "execution_count": 176,
     "data": {
      "text/plain": "0.705"
     },
     "metadata": {}
    }
   ]
  },
  {
   "metadata": {
    "trusted": true
   },
   "cell_type": "code",
   "source": "#at paper level partial match\natleast_one_p = 0\nfor condition in gold_2:\n    boo = False\n    for cond in condition:\n        for key in cond_list.keys():\n            if cond in key:\n                boo = True\n    if boo == True:\n        atleast_one_p+=1",
   "execution_count": 178,
   "outputs": []
  },
  {
   "metadata": {
    "trusted": true
   },
   "cell_type": "code",
   "source": "atleast_one_p/len(gold_2)",
   "execution_count": 179,
   "outputs": [
    {
     "output_type": "execute_result",
     "execution_count": 179,
     "data": {
      "text/plain": "0.755"
     },
     "metadata": {}
    }
   ]
  },
  {
   "metadata": {
    "trusted": true
   },
   "cell_type": "code",
   "source": "#find example where there isn't a full match\nno_match = []\nfor i in range(len(gold_2)):\n    boo = False\n    condition = gold_2[i]\n    for cond in condition:\n        if cond in cond_list.keys():\n            boo = True\n    if boo == False:\n        no_match.append(i)",
   "execution_count": 180,
   "outputs": []
  },
  {
   "metadata": {
    "trusted": true
   },
   "cell_type": "code",
   "source": "no_match[0:20]",
   "execution_count": 181,
   "outputs": [
    {
     "output_type": "execute_result",
     "execution_count": 181,
     "data": {
      "text/plain": "[5, 9, 14, 22, 24, 27, 28, 35, 39, 42, 44, 53, 54, 55, 59, 63, 72, 75, 76, 83]"
     },
     "metadata": {}
    }
   ]
  },
  {
   "metadata": {
    "trusted": true
   },
   "cell_type": "code",
   "source": "k = 3\nprint(\"Abstract: \", abstracts[k])\nprint(\"What is annotated: \", gold[k])\nprint(\"What we found: \", long_predictions[k])",
   "execution_count": 214,
   "outputs": [
    {
     "output_type": "stream",
     "text": "Abstract:  In vitro biocompatibility tests of glass ionomer cements impregnated with collagen or bioactive glass to fibroblasts.\n\nAIM AND DESIGN To evaluate the biocompatibility of glass ionomer cement (GIC) impregnated with collagen or bioactive glass to BHK-21 fibroblasts in vitro. Mineral Trioxide Aggregate was used as the standard for comparison. Human maxillary central incisors (n = 70) were instrumented with a rotary NiTi system and filled. Following resection of the apical 3mm, root end cavities were prepared and restored with conventional GIC (group 1) or GIC with 0.01%, 0.1% or 1% collagen (groups 2, 3, 4 respectively) or, 10%, 30% or 50% bioactive glass (groups 5, 6, 7 respectively), or Mineral Trioxide Aggregate (group 8). The root slices were incubated in tissue culture plates with BHK-21 fibroblast cell line. Phase contrast and scanning electron microscopes were used to score cell quantity, morphology and cell attachment. The data were statistically analyzed by one way ANOVA with Post Hoc Tukey HSD test (p = 0.05).\nRESULTS AND CONCLUSIONS Group 5 showed the highest scores which was significantly higher than all other groups (p < 0.05) except group 8, with which there was no significant difference (p > 0.05). Glass ionomer cement with 10% bioactive glass showed better adhesion and spreading of cells than glass ionomer cement with 0.01% collagen. The biocompatibility of collagen and bioactive glass was concentration dependent. The addition of bioactive glass improved the biocompatibility of glass ionomer cement to fibroblasts better than addition of collagen.\n\n\nWhat is annotated:  ['Human maxillary central incisors', 'root end cavities', 'root slices']\nWhat we found:  ['healthy', 'aged', 'healthy adults', 'behavior', 'adult', 'diet', 'osa', 'health', 'cognitive performance', 'attention', 'cerebral blood flow', 'men', 'control', 'performance', 'outcome', 'age', 'brain function', 'healthy young adults', 'art', 'placebo', 'ect', 'hemoglobin', 'outcomes', 'behavioral', 'brain', 'supplementation', 'healthy adult', 'adults', 'young adult', 'sah', 'ad', 'ct', 'ph', 'ami', 'blood flow', 'fatty acids', 'osah', 'young adults', 'function', 'acc', 'stent', 'young', 'ra', 'dietary supplement', 'dietary supplementation', 'ms', 'pe', 'blood', 'cts', 'hemodynamic', 'hemodynamic response', 'par', 'iol', 'near-infrared spectroscopy', 'healthy young', 'ich', 'oic', 'blind', 'spect', 'ed', 'act', 'fatty acid', 'concentration', 'he', 'ifi', 'controlled', 'supplement', 'computer', 'er', 'ain', 'prefrontal cortex', 'ic', 'double-blind', 'alt', 'omega-3 polyunsaturated fatty acids', 'mpa', 'ee', 'mic', 'polyunsaturated fatty acids', 'as', 'os']\n",
     "name": "stdout"
    }
   ]
  },
  {
   "metadata": {
    "trusted": true
   },
   "cell_type": "code",
   "source": "no_match_p = []\nfor i in range(len(gold)):\n    boo = False\n    condition = gold[i]\n    for cond in condition:\n        for key in cond_list.keys():\n            if cond in key:\n                boo = True\n    if boo == False and len(condition) > 0:\n        no_match_p.append(i)",
   "execution_count": 189,
   "outputs": [
    {
     "output_type": "error",
     "ename": "KeyboardInterrupt",
     "evalue": "",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-189-7ee41a36ec56>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      5\u001b[0m     \u001b[0;32mfor\u001b[0m \u001b[0mcond\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mcondition\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      6\u001b[0m         \u001b[0;32mfor\u001b[0m \u001b[0mkey\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mcond_list\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkeys\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m             \u001b[0;32mif\u001b[0m \u001b[0mcond\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      8\u001b[0m                 \u001b[0mboo\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      9\u001b[0m     \u001b[0;32mif\u001b[0m \u001b[0mboo\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;32mFalse\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcondition\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
     ]
    }
   ]
  },
  {
   "metadata": {
    "trusted": true
   },
   "cell_type": "code",
   "source": "print(len(no_match_p))\nprint(len(no_match))\nprint(no_match_p[0:30])",
   "execution_count": 196,
   "outputs": [
    {
     "output_type": "stream",
     "text": "180\n59\n[3, 12, 14, 16, 24, 36, 44, 48, 58, 61, 62, 67, 68, 69, 95, 96, 99, 100, 103, 105, 107, 110, 122, 124, 126, 132, 133, 136, 138, 141]\n",
     "name": "stdout"
    }
   ]
  },
  {
   "metadata": {
    "trusted": true
   },
   "cell_type": "code",
   "source": "k = 141\nprint(\"Abstract: \", abstracts[k])\nprint(\"What is annotated: \", gold[k])\nprint(\"What we found: \", long_predictions[k])",
   "execution_count": 201,
   "outputs": [
    {
     "output_type": "stream",
     "text": "Abstract:  Randomized Phase II trial assessing estramustine and vinblastine combination chemotherapy vs estramustine alone in patients with progressive hormone-escaped metastatic prostate cancer.\n\nBased on the results of combined data from three North American Phase II studies, a randomised Phase II study in the same patient population was performed, using combination chemotherapy with estramustine phosphate (EMP) and vinblastine (VBL) in hormone refractory prostate cancer patients. In all, 92 patients were randomised into a Phase II study of oral EMP (10 mg kg day continuously) or oral EMP in combination with intravenous VBL (4 mg m(2) week for 6 weeks, followed by 2 weeks rest). The end points were toxicity and PSA response in both groups, with the option to continue the trial as a Phase III study with time to progression and survival as end points, if sufficient responses were observed. Toxicity was unexpectedly high in both treatment arms and led to treatment withdrawal or refusal in 49% of all patients, predominantly already during the first treatment cycle. The mean treatment duration was 10 and 14 weeks, median time to PSA progression was 27.2 and 30.8 weeks, median survival time was 44 and 50.9 weeks, and PSA response rate was only 24.6 and 28.9% in the EMP/VBL and EMP arms, respectively. There was no correlation between PSA response and survival. While the PSA response in the patients tested was less than half that recorded in the North American studies, the toxicity of EMP monotherapy or in combination with VBL was much higher than expected. Further research on more effective and less toxic treatment strategies for hormone refractory prostate cancer is mandatory.\n\n\nWhat is annotated:  ['progressive hormone-escaped metastatic prostate cancer']\nWhat we found:  ['hepatitis c', 'infection', 'chronic hepatitis c', 'hepatitis', 'treatment', 'hcv', 'clinical trial', 'tia', 'chronic hepatitis', 'men', 'outcome', 'age', 'therapy', 'chronic', 'patients', 'genotype', 'genotype 1', 'ect', 'uti', 'outcomes', 'exposure', 'ct', 'ph', 'ami', 'acc', 'ra', 'pe', 'cts', 'practice', 'drug', 'sid', 'spect', 'achievement', 'tic', 'ed', 'net', 'act', 'ict', 'he', 'ifi', 'clinical', 'trial', 'tha', 'type 1', 'interferon alpha', 'er', 'ain', 'ic', 'stai', 'rop', 'mpa', 'tis', 'ee', 'chi', 'mic', 'evaluated', 'naive patients', 'as', 'os']\n",
     "name": "stdout"
    }
   ]
  },
  {
   "metadata": {
    "trusted": true
   },
   "cell_type": "code",
   "source": "len(long_predictions_2)",
   "execution_count": 127,
   "outputs": [
    {
     "output_type": "execute_result",
     "execution_count": 127,
     "data": {
      "text/plain": "0"
     },
     "metadata": {}
    }
   ]
  },
  {
   "metadata": {
    "trusted": true
   },
   "cell_type": "code",
   "source": "cond_list['pathology']",
   "execution_count": 147,
   "outputs": [
    {
     "output_type": "execute_result",
     "execution_count": 147,
     "data": {
      "text/plain": "'13'"
     },
     "metadata": {}
    }
   ]
  },
  {
   "metadata": {
    "trusted": true
   },
   "cell_type": "code",
   "source": "\n",
   "execution_count": 163,
   "outputs": [
    {
     "output_type": "execute_result",
     "execution_count": 163,
     "data": {
      "text/plain": "False"
     },
     "metadata": {}
    }
   ]
  },
  {
   "metadata": {
    "trusted": true
   },
   "cell_type": "code",
   "source": "",
   "execution_count": null,
   "outputs": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3",
   "language": "python"
  },
  "language_info": {
   "name": "python",
   "version": "3.6.0",
   "mimetype": "text/x-python",
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "pygments_lexer": "ipython3",
   "nbconvert_exporter": "python",
   "file_extension": ".py"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}