{
 "metadata": {
  "name": "",
  "signature": "sha256:abf94f54fa884488179ceded93c241aef597992a2f7ed5204ebb44d90f973709"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "# Python Parsing Puzzle\n",
      "Posed here:\n",
      "\n",
      "https://mail.python.org/pipermail/centraloh/2014-December/002232.html"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from __future__ import print_function\n",
      "import re\n",
      "import numpy as np\n",
      "import pandas as pd"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 1
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Let's look at the source text, which shows a common pattern betwen `H1`, `H2`, `H3`, and `H4`"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "origtext = open('formulas.txt').read()\n",
      "print(origtext)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "81.137644601433 + 36.6589616861417 * H1 + -31.1046441228118 * H2+ -\n",
        "28.816467210923 * H3 + 29.9428662798989 * H4\n",
        "\n",
        "H1:\n",
        "\n",
        "TanH(0.5 * ((-0.93608461826401) + 0.0279759444862288 * A + 0.00317168579532788 *B + -0.00427397729136767 * C + 0.0125193801427099 * D +0.00721746180478183 * E + 0.0170333918733729 * F + 0.39879852406802* G + -0.608563852112721 * H + 1.65930260987635 * J +\n",
        "1.43483965025228 * K + -0.402472411054028 * L +\n",
        "0.574879148093717 * M))\n",
        "\n",
        "H2:\n",
        "\n",
        "TanH(0.5 * (2.54086700956048 + -0.0309791525829621 * A + 0.00237540217041462 *B + 0.000229714345399431 * C + 0.00656732772813456 * D +0.00858189985455511 * E + 0.0306145052833575 * F + 0.325324436920013* G + -0.0652380713439368 * H + 0.502808777611107 * J +\n",
        "2.62515706116404 * K + 0.157259327153552 * L + -\n",
        "0.0161835383305806 * M))\n",
        "\n",
        "H3:\n",
        "\n",
        "TanH(0.5 * ((-0.215510423022572) + 0.00341969653419575 * A + 0.00595421019457033 * B + 0.00208464969350523 * C + 0.0439352627700078 * D + -0.00692214398088429 * E + 0.0651828126561487 * F + 0.688661878742455\n",
        " * G + -1.18547032625557 * H + 2.44391733654282 * J +\n",
        "1.26094801313471 * K + -0.504334460563991 * L + -\n",
        "0.43207595594702 * M))\n",
        "\n",
        "H4:\n",
        "\n",
        "TanH(0.5 * ((-3.1962875744613) + 0.0274843861132957 * A + 0.00431941802031792 *B + 0.000873073437785473 * C + 0.036096877808041 * D + -0.00616748059025476 * E + 0.0410646957742769 * F + 0.321067469333745 * G + -0.344778527751441 * H + 0.335721855268919 * J +\n",
        "2.01028981590329 * K + 0.462469794510566 * L +\n",
        "0.279409768455546 * M))\n",
        "\n",
        "\n"
       ]
      }
     ],
     "prompt_number": 2
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## Extract coefficient data using regular expressions\n",
      "Since the formulas follow the same pattern, we can use regular expressions to extract the data that we want.  Let's do this in three steps:\n",
      "\n",
      "1. separate the `H#` equations\n",
      "2. get the coefficients for each `H#` equation\n",
      "3. get the coefficients for the final equation (the formula at the beginning)\n",
      "\n",
      "Once accomplished, it will be trivial to save the coefficient data to csv files (or similar) that can be reviewed, including loaded into Excel to validate the calculations."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# replace newlines with spaces for easier regex matching\n",
      "text = origtext.replace('\\n', ' ')\n",
      "\n",
      "# 1. match two groups (key-val pairs); for example, \"H1\" and the equation excluding \"Tanh(0.5 *\"\n",
      "equations = dict(re.findall('(H\\d): +TanH.0.5[^(]+([^M]+M\\)*)', text))\n",
      "equations"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 3,
       "text": [
        "{'H1': '((-0.93608461826401) + 0.0279759444862288 * A + 0.00317168579532788 *B + -0.00427397729136767 * C + 0.0125193801427099 * D +0.00721746180478183 * E + 0.0170333918733729 * F + 0.39879852406802* G + -0.608563852112721 * H + 1.65930260987635 * J + 1.43483965025228 * K + -0.402472411054028 * L + 0.574879148093717 * M))',\n",
        " 'H2': '(2.54086700956048 + -0.0309791525829621 * A + 0.00237540217041462 *B + 0.000229714345399431 * C + 0.00656732772813456 * D +0.00858189985455511 * E + 0.0306145052833575 * F + 0.325324436920013* G + -0.0652380713439368 * H + 0.502808777611107 * J + 2.62515706116404 * K + 0.157259327153552 * L + - 0.0161835383305806 * M))',\n",
        " 'H3': '((-0.215510423022572) + 0.00341969653419575 * A + 0.00595421019457033 * B + 0.00208464969350523 * C + 0.0439352627700078 * D + -0.00692214398088429 * E + 0.0651828126561487 * F + 0.688661878742455  * G + -1.18547032625557 * H + 2.44391733654282 * J + 1.26094801313471 * K + -0.504334460563991 * L + - 0.43207595594702 * M))',\n",
        " 'H4': '((-3.1962875744613) + 0.0274843861132957 * A + 0.00431941802031792 *B + 0.000873073437785473 * C + 0.036096877808041 * D + -0.00616748059025476 * E + 0.0410646957742769 * F + 0.321067469333745 * G + -0.344778527751441 * H + 0.335721855268919 * J + 2.01028981590329 * K + 0.462469794510566 * L + 0.279409768455546 * M))'}"
       ]
      }
     ],
     "prompt_number": 3
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Now let's get the coefficient data, which includes a single intercept and 12 variable coefficients for each `H#` equation."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# construct the regex's and make sure they work on an equation\n",
      "matchfloat = '-? *\\d*\\.\\d*'\n",
      "\n",
      "# match the intercept value\n",
      "matchintercept = '({matchfloat}) *\\)? *\\+'.format(matchfloat=matchfloat)\n",
      "# match both the coefficient value and the variable name [A-M]\n",
      "matchvariable = '({matchfloat}) *\\* *([A-M])'.format(matchfloat=matchfloat)\n",
      "\n",
      "print('Intercept:')\n",
      "print(re.findall(matchintercept, equations['H1']))\n",
      "print('Coefficients:')\n",
      "re.findall(matchvariable, equations['H1'])"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "Intercept:\n",
        "['-0.93608461826401']\n",
        "Coefficients:\n"
       ]
      },
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 4,
       "text": [
        "[(' 0.0279759444862288', 'A'),\n",
        " (' 0.00317168579532788', 'B'),\n",
        " ('-0.00427397729136767', 'C'),\n",
        " (' 0.0125193801427099', 'D'),\n",
        " ('0.00721746180478183', 'E'),\n",
        " (' 0.0170333918733729', 'F'),\n",
        " (' 0.39879852406802', 'G'),\n",
        " ('-0.608563852112721', 'H'),\n",
        " (' 1.65930260987635', 'J'),\n",
        " (' 1.43483965025228', 'K'),\n",
        " ('-0.402472411054028', 'L'),\n",
        " (' 0.574879148093717', 'M')]"
       ]
      }
     ],
     "prompt_number": 4
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "The coefficients above look like key-value pairs.  Let's make dict of `{varname: coeff}` and convert `coeff` to float.\n",
      "\n",
      "Use `'intercept'` as the key for the intercept value in the dict.\n",
      "\n",
      "As a sanity check, also verify there is only one intercept and exactly 12 variables extracted by the regex."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def tofloat(sval):\n",
      "    return float(sval.replace(' ', ''))\n",
      "\n",
      "def parsevars(eqtext):\n",
      "    d = dict(map(reversed, re.findall(matchvariable, eqtext)))\n",
      "    assert len(d) == 12\n",
      "    intercept = re.findall(matchintercept, eqtext)\n",
      "    assert len(intercept) == 1\n",
      "    d['intercept'] = intercept[0]\n",
      "    d = dict((key, tofloat(val))\n",
      "             for key, val in d.items())\n",
      "    return d\n",
      "\n",
      "parsevars(equations['H1'])"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 5,
       "text": [
        "{'A': 0.0279759444862288,\n",
        " 'B': 0.00317168579532788,\n",
        " 'C': -0.00427397729136767,\n",
        " 'D': 0.0125193801427099,\n",
        " 'E': 0.00721746180478183,\n",
        " 'F': 0.0170333918733729,\n",
        " 'G': 0.39879852406802,\n",
        " 'H': -0.608563852112721,\n",
        " 'J': 1.65930260987635,\n",
        " 'K': 1.43483965025228,\n",
        " 'L': -0.402472411054028,\n",
        " 'M': 0.574879148093717,\n",
        " 'intercept': -0.93608461826401}"
       ]
      }
     ],
     "prompt_number": 5
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Perfect!\n",
      "\n",
      "Now do this for all the `H#` equations and construct a `pandas` DataFrame of coefficients."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "eq_coeff = pd.DataFrame(map(parsevars, equations.values()), index=equations.keys()).sort()\n",
      "eq_coeff.T  # too wide for the screen, so take the transpose"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>H1</th>\n",
        "      <th>H2</th>\n",
        "      <th>H3</th>\n",
        "      <th>H4</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>A</th>\n",
        "      <td> 0.027976</td>\n",
        "      <td>-0.030979</td>\n",
        "      <td> 0.003420</td>\n",
        "      <td> 0.027484</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>B</th>\n",
        "      <td> 0.003172</td>\n",
        "      <td> 0.002375</td>\n",
        "      <td> 0.005954</td>\n",
        "      <td> 0.004319</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>C</th>\n",
        "      <td>-0.004274</td>\n",
        "      <td> 0.000230</td>\n",
        "      <td> 0.002085</td>\n",
        "      <td> 0.000873</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>D</th>\n",
        "      <td> 0.012519</td>\n",
        "      <td> 0.006567</td>\n",
        "      <td> 0.043935</td>\n",
        "      <td> 0.036097</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>E</th>\n",
        "      <td> 0.007217</td>\n",
        "      <td> 0.008582</td>\n",
        "      <td>-0.006922</td>\n",
        "      <td>-0.006167</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>F</th>\n",
        "      <td> 0.017033</td>\n",
        "      <td> 0.030615</td>\n",
        "      <td> 0.065183</td>\n",
        "      <td> 0.041065</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>G</th>\n",
        "      <td> 0.398799</td>\n",
        "      <td> 0.325324</td>\n",
        "      <td> 0.688662</td>\n",
        "      <td> 0.321067</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>H</th>\n",
        "      <td>-0.608564</td>\n",
        "      <td>-0.065238</td>\n",
        "      <td>-1.185470</td>\n",
        "      <td>-0.344779</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>J</th>\n",
        "      <td> 1.659303</td>\n",
        "      <td> 0.502809</td>\n",
        "      <td> 2.443917</td>\n",
        "      <td> 0.335722</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>K</th>\n",
        "      <td> 1.434840</td>\n",
        "      <td> 2.625157</td>\n",
        "      <td> 1.260948</td>\n",
        "      <td> 2.010290</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>L</th>\n",
        "      <td>-0.402472</td>\n",
        "      <td> 0.157259</td>\n",
        "      <td>-0.504334</td>\n",
        "      <td> 0.462470</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>M</th>\n",
        "      <td> 0.574879</td>\n",
        "      <td>-0.016184</td>\n",
        "      <td>-0.432076</td>\n",
        "      <td> 0.279410</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>intercept</th>\n",
        "      <td>-0.936085</td>\n",
        "      <td> 2.540867</td>\n",
        "      <td>-0.215510</td>\n",
        "      <td>-3.196288</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 6,
       "text": [
        "                 H1        H2        H3        H4\n",
        "A          0.027976 -0.030979  0.003420  0.027484\n",
        "B          0.003172  0.002375  0.005954  0.004319\n",
        "C         -0.004274  0.000230  0.002085  0.000873\n",
        "D          0.012519  0.006567  0.043935  0.036097\n",
        "E          0.007217  0.008582 -0.006922 -0.006167\n",
        "F          0.017033  0.030615  0.065183  0.041065\n",
        "G          0.398799  0.325324  0.688662  0.321067\n",
        "H         -0.608564 -0.065238 -1.185470 -0.344779\n",
        "J          1.659303  0.502809  2.443917  0.335722\n",
        "K          1.434840  2.625157  1.260948  2.010290\n",
        "L         -0.402472  0.157259 -0.504334  0.462470\n",
        "M          0.574879 -0.016184 -0.432076  0.279410\n",
        "intercept -0.936085  2.540867 -0.215510 -3.196288"
       ]
      }
     ],
     "prompt_number": 6
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Now that the coefficients are in a DataFrame, we can easily save the data to csv, excel, hdf5, a SQL database, etc.  Another advantage of using a DataFrame is the equations can be calculated easily and efficiently.\n",
      "\n",
      "The final regex operation below extracts the coefficients used in the final equation.  Here again is the text we are matching against:"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "text[:text.index('H1:')]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 7,
       "text": [
        "'81.137644601433 + 36.6589616861417 * H1 + -31.1046441228118 * H2+ - 28.816467210923 * H3 + 29.9428662798989 * H4  '"
       ]
      }
     ],
     "prompt_number": 7
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "coeff = re.findall(\n",
      "    '({matchfloat}){plus}'\n",
      "    '({matchfloat}){times}H1{plus}'\n",
      "    '({matchfloat}){times}H2{plus}'\n",
      "    '({matchfloat}){times}H3{plus}'\n",
      "    '({matchfloat}){times}H4'\n",
      "    .format(matchfloat=matchfloat, plus=' *\\+ *', times=' *\\* *'),\n",
      "    text)\n",
      "\n",
      "coeff"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 8,
       "text": [
        "[('81.137644601433',\n",
        "  '36.6589616861417',\n",
        "  '-31.1046441228118',\n",
        "  '- 28.816467210923',\n",
        "  '29.9428662798989')]"
       ]
      }
     ],
     "prompt_number": 8
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Perfect!  Now load this to a `pandas` Series and convert the values to float"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "H_coeff = pd.Series(map(tofloat, coeff[0]),\n",
      "                    index=['intercept', 'H1', 'H2', 'H3', 'H4'])\n",
      "H_coeff"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 9,
       "text": [
        "intercept    81.137645\n",
        "H1           36.658962\n",
        "H2          -31.104644\n",
        "H3          -28.816467\n",
        "H4           29.942866\n",
        "dtype: float64"
       ]
      }
     ],
     "prompt_number": 9
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## Perform the calculations\n",
      "\n",
      "We weren't given sample input, so let's make up our own input for demonstration"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "input_series = 0.1 + 0.1 * pd.Series(range(12), index=eq_coeff.columns[:-1])  # exclude 'intercept'\n",
      "input_series"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 10,
       "text": [
        "A    0.1\n",
        "B    0.2\n",
        "C    0.3\n",
        "D    0.4\n",
        "E    0.5\n",
        "F    0.6\n",
        "G    0.7\n",
        "H    0.8\n",
        "J    0.9\n",
        "K    1.0\n",
        "L    1.1\n",
        "M    1.2\n",
        "dtype: float64"
       ]
      }
     ],
     "prompt_number": 10
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Let's walk through the calculation one step at a time and inspect the results.  We'll make the calculation cleaner and more flexible at the very end."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# calculate the value within the \"0.5*(<value>)\" parentheses\n",
      "X = eq_coeff[input_series.index].dot(input_series) + eq_coeff['intercept']\n",
      "X"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 11,
       "text": [
        "H1    2.052557\n",
        "H2    5.970386\n",
        "H3    1.760772\n",
        "H4   -0.051047\n",
        "dtype: float64"
       ]
      }
     ],
     "prompt_number": 11
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# for sanity, let's look at the individual contributions of each factor\n",
      "debug = eq_coeff[input_series.index] * input_series\n",
      "debug['intercept'] = eq_coeff['intercept']\n",
      "debug.T"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>H1</th>\n",
        "      <th>H2</th>\n",
        "      <th>H3</th>\n",
        "      <th>H4</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>A</th>\n",
        "      <td> 0.002798</td>\n",
        "      <td>-0.003098</td>\n",
        "      <td> 0.000342</td>\n",
        "      <td> 0.002748</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>B</th>\n",
        "      <td> 0.000634</td>\n",
        "      <td> 0.000475</td>\n",
        "      <td> 0.001191</td>\n",
        "      <td> 0.000864</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>C</th>\n",
        "      <td>-0.001282</td>\n",
        "      <td> 0.000069</td>\n",
        "      <td> 0.000625</td>\n",
        "      <td> 0.000262</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>D</th>\n",
        "      <td> 0.005008</td>\n",
        "      <td> 0.002627</td>\n",
        "      <td> 0.017574</td>\n",
        "      <td> 0.014439</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>E</th>\n",
        "      <td> 0.003609</td>\n",
        "      <td> 0.004291</td>\n",
        "      <td>-0.003461</td>\n",
        "      <td>-0.003084</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>F</th>\n",
        "      <td> 0.010220</td>\n",
        "      <td> 0.018369</td>\n",
        "      <td> 0.039110</td>\n",
        "      <td> 0.024639</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>G</th>\n",
        "      <td> 0.279159</td>\n",
        "      <td> 0.227727</td>\n",
        "      <td> 0.482063</td>\n",
        "      <td> 0.224747</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>H</th>\n",
        "      <td>-0.486851</td>\n",
        "      <td>-0.052190</td>\n",
        "      <td>-0.948376</td>\n",
        "      <td>-0.275823</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>J</th>\n",
        "      <td> 1.493372</td>\n",
        "      <td> 0.452528</td>\n",
        "      <td> 2.199526</td>\n",
        "      <td> 0.302150</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>K</th>\n",
        "      <td> 1.434840</td>\n",
        "      <td> 2.625157</td>\n",
        "      <td> 1.260948</td>\n",
        "      <td> 2.010290</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>L</th>\n",
        "      <td>-0.442720</td>\n",
        "      <td> 0.172985</td>\n",
        "      <td>-0.554768</td>\n",
        "      <td> 0.508717</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>M</th>\n",
        "      <td> 0.689855</td>\n",
        "      <td>-0.019420</td>\n",
        "      <td>-0.518491</td>\n",
        "      <td> 0.335292</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>intercept</th>\n",
        "      <td>-0.936085</td>\n",
        "      <td> 2.540867</td>\n",
        "      <td>-0.215510</td>\n",
        "      <td>-3.196288</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 12,
       "text": [
        "                 H1        H2        H3        H4\n",
        "A          0.002798 -0.003098  0.000342  0.002748\n",
        "B          0.000634  0.000475  0.001191  0.000864\n",
        "C         -0.001282  0.000069  0.000625  0.000262\n",
        "D          0.005008  0.002627  0.017574  0.014439\n",
        "E          0.003609  0.004291 -0.003461 -0.003084\n",
        "F          0.010220  0.018369  0.039110  0.024639\n",
        "G          0.279159  0.227727  0.482063  0.224747\n",
        "H         -0.486851 -0.052190 -0.948376 -0.275823\n",
        "J          1.493372  0.452528  2.199526  0.302150\n",
        "K          1.434840  2.625157  1.260948  2.010290\n",
        "L         -0.442720  0.172985 -0.554768  0.508717\n",
        "M          0.689855 -0.019420 -0.518491  0.335292\n",
        "intercept -0.936085  2.540867 -0.215510 -3.196288"
       ]
      }
     ],
     "prompt_number": 12
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# for kicks, let's use this debug DataFrame to perform a consistenty check\n",
      "X.equals(debug.sum(axis=1))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 13,
       "text": [
        "True"
       ]
      }
     ],
     "prompt_number": 13
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# continuing with the calculation...\n",
      "TANH_X = np.tanh(0.5 * X)\n",
      "TANH_X"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 14,
       "text": [
        "H1    0.772411\n",
        "H2    0.994906\n",
        "H3    0.706613\n",
        "H4   -0.025518\n",
        "dtype: float64"
       ]
      }
     ],
     "prompt_number": 14
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# and take the dot product again just as we did above\n",
      "result = H_coeff[TANH_X.index].dot(TANH_X) + H_coeff['intercept']\n",
      "result"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 15,
       "text": [
        "57.381069996551943"
       ]
      }
     ],
     "prompt_number": 15
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## Streamline the calculation\n",
      "Make the expressions more human-readable by introducing a helper function `dotcoeff`.  While we're at it, let's support both `pandas` Series and DataFrames as input.  By adding DataFrame support, the user can perform the calculation for multiple inputs simultaneously."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def dotcoeff(coeff, data):\n",
      "    \"\"\"Perform the dot product of coeff and data, and add coeff['intercept'].\n",
      "    \n",
      "    Use the index (or columns) of the input data Series (or DataFrame).\n",
      "    \"\"\"\n",
      "    return coeff[data.T.index].dot(data.T).T + coeff['intercept']\n",
      "\n",
      "def docalc(data):\n",
      "    X = dotcoeff(eq_coeff, data)\n",
      "    TANH_X = np.tanh(0.5 * X)\n",
      "    return dotcoeff(H_coeff, TANH_X)\n",
      "\n",
      "# Make sure we still get the same result with Series input\n",
      "docalc(input_series)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 16,
       "text": [
        "57.381069996551943"
       ]
      }
     ],
     "prompt_number": 16
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Now try DataFrame input\n",
      "input_df = pd.DataFrame([input_series, 2*input_series])\n",
      "input_df"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>A</th>\n",
        "      <th>B</th>\n",
        "      <th>C</th>\n",
        "      <th>D</th>\n",
        "      <th>E</th>\n",
        "      <th>F</th>\n",
        "      <th>G</th>\n",
        "      <th>H</th>\n",
        "      <th>J</th>\n",
        "      <th>K</th>\n",
        "      <th>L</th>\n",
        "      <th>M</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
        "      <td> 0.1</td>\n",
        "      <td> 0.2</td>\n",
        "      <td> 0.3</td>\n",
        "      <td> 0.4</td>\n",
        "      <td> 0.5</td>\n",
        "      <td> 0.6</td>\n",
        "      <td> 0.7</td>\n",
        "      <td> 0.8</td>\n",
        "      <td> 0.9</td>\n",
        "      <td> 1</td>\n",
        "      <td> 1.1</td>\n",
        "      <td> 1.2</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
        "      <td> 0.2</td>\n",
        "      <td> 0.4</td>\n",
        "      <td> 0.6</td>\n",
        "      <td> 0.8</td>\n",
        "      <td> 1.0</td>\n",
        "      <td> 1.2</td>\n",
        "      <td> 1.4</td>\n",
        "      <td> 1.6</td>\n",
        "      <td> 1.8</td>\n",
        "      <td> 2</td>\n",
        "      <td> 2.2</td>\n",
        "      <td> 2.4</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 17,
       "text": [
        "     A    B    C    D    E    F    G    H    J  K    L    M\n",
        "0  0.1  0.2  0.3  0.4  0.5  0.6  0.7  0.8  0.9  1  1.1  1.2\n",
        "1  0.2  0.4  0.6  0.8  1.0  1.2  1.4  1.6  1.8  2  2.2  2.4"
       ]
      }
     ],
     "prompt_number": 17
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "docalc(input_df)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 18,
       "text": [
        "0    57.381070\n",
        "1    86.097694\n",
        "dtype: float64"
       ]
      }
     ],
     "prompt_number": 18
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "And we're done! (well, I'm done anyway)\n",
      "\n",
      "Hopefully any revisions of the coefficients will be done within a data store (such as csv or database) so we won't need to parse them again using regular expressions."
     ]
    }
   ],
   "metadata": {}
  }
 ]
}