• API
• FAQ
• Tools
• Archive
daily pastebin goal
41%
SHARE
TWEET

# Untitled

a guest Sep 12th, 2018 55 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
1. {
2.  "cells": [
3.   {
4.    "cell_type": "markdown",
6.    "source": [
7.     "## Fit the Heights of Two Peaks\n",
8.     "\n",
9.     "Assume we have two components A and B -- a signal peak over a flat background -- and measure their sum. We know the shape and location of both, but we are interested in the relative contributions from either component:\n",
10.     "if we see 1000 events, how many of them are generated by process A and B, respectively?\n"
11.    ]
12.   },
13.   {
14.    "cell_type": "code",
15.    "execution_count": 1,
17.    "outputs": [],
18.    "source": [
19.     "import matplotlib.pyplot as plt\n",
20.     "import numpy as np\n",
21.     "from scipy import stats\n",
22.     "from statsmodels.base.model import GenericLikelihoodModel"
23.    ]
24.   },
25.   {
26.    "cell_type": "markdown",
28.    "source": [
29.     "Generate some pseudodata."
30.    ]
31.   },
32.   {
33.    "cell_type": "code",
34.    "execution_count": 2,
36.    "outputs": [],
37.    "source": [
     "# Reproducible pseudodata: a narrow half-Cauchy \"signal\" peak near 0\n",
     "# on top of a flat \"background\" uniform on [0, 100).\n",
     "# NOTE: keep the legacy global seed + sampling order unchanged -- the\n",
     "# recorded outputs below depend on this exact RNG stream.\n",
     "np.random.seed(42)\n",
     "pdf_a = stats.halfcauchy(loc=0, scale=1)\n",
     "pdf_b = stats.uniform(loc=0, scale=100)\n",
     "\n",
     "# True yields: 30 signal and 1000 background events (1030 total).\n",
     "n_a = 30\n",
     "n_b = 1000\n",
     "\n",
     "# Stack both samples into a single (n, 1) column vector, signal first.\n",
     "X = np.concatenate([\n",
     "    pdf_a.rvs(size=n_a),\n",
     "    pdf_b.rvs(size=n_b),\n",
     "])[:, np.newaxis]"
49.    ]
50.   },
51.   {
52.    "cell_type": "code",
53.    "execution_count": 3,
55.    "outputs": [
56.     {
57.      "data": {
59.       "text/plain": [
60.        "<Figure size 640x480 with 1 Axes>"
61.       ]
62.      },
64.       "image/png": {
65.        "height": 414,
66.        "width": 553
67.       }
68.      },
69.      "output_type": "display_data"
70.     }
71.    ],
72.    "source": [
     "# Quick look at the combined sample; the trailing ';' suppresses the\n",
     "# list-of-patches repr so only the figure is shown.\n",
     "plt.hist(X, bins='auto');"
74.    ]
75.   },
76.   {
77.    "cell_type": "markdown",
79.    "source": [
     "We define a custom maximum-likelihood model here; it has no exogenous (predictor) variables, only the observed sample."
81.    ]
82.   },
83.   {
84.    "cell_type": "code",
85.    "execution_count": 4,
87.    "outputs": [],
88.    "source": [
     "class TwoPeakLLH(GenericLikelihoodModel):\n",
     "    \"\"\"Fit height of signal peak over background.\n",
     "\n",
     "    Extended (unbinned) maximum-likelihood model for the yields of two\n",
     "    fixed-shape components: a signal peak and a flat background. The\n",
     "    observed events are passed as ``endog``; the two fitted parameters\n",
     "    are the component yields ``n_signal`` and ``n_background``.\n",
     "    \"\"\"\n",
     "    # Optimizer starting point for (n_signal, n_background).\n",
     "    start_params = [10, 1000]\n",
     "    # NOTE(review): presumably the attributes statsmodels copies when it\n",
     "    # clones the model (e.g. during res.bootstrap()) -- confirm against\n",
     "    # the statsmodels docs.\n",
     "    cloneattr = ['start_params', 'signal', 'background']\n",
     "    # Labels used for the parameter rows / dependent variable in the\n",
     "    # summary table (see res.summary() output below).\n",
     "    exog_names = ['n_signal', 'n_background']\n",
     "    endog_names = ['alpha']\n",
     "    \n",
     "    def __init__(self, endog, exog=None, signal=None, background=None,\n",
     "                 *args, **kwargs):\n",
     "        # assume we know the shape + location of the two components,\n",
     "        # so we re-use their PDFs here.\n",
     "        # `signal` / `background` are frozen scipy.stats distributions\n",
     "        # (only their .pdf() is used by nlnlike below).\n",
     "        self.signal = signal\n",
     "        self.background = background\n",
     "        super(TwoPeakLLH, self).__init__(endog=endog, exog=exog, \n",
     "                                         *args, **kwargs)\n",
     "\n",
     "    def loglike(self, params):        # pylint: disable=E0202\n",
     "        # statsmodels maximizes loglike; we express everything in terms\n",
     "        # of the negative log-likelihood and flip the sign here.\n",
     "        return -self.nloglike(params)\n",
     "\n",
     "    def nloglike(self, params):\n",
     "        # Negative log-likelihood of the stored observations.\n",
     "        endog = self.endog\n",
     "        return self.nlnlike(params, endog)\n",
     "\n",
     "    def nlnlike(self, params, endog):\n",
     "        # Extended negative log-likelihood for observations `endog` given\n",
     "        # params = (n_signal, n_background).\n",
     "        n_sig = params[0]\n",
     "        n_bkg = params[1]\n",
     "        # Yields must be non-negative; return +inf so the minimizer\n",
     "        # rejects this point.\n",
     "        if (n_sig < 0) or n_bkg < 0:\n",
     "            return np.inf\n",
     "        n_tot = n_bkg + n_sig\n",
     "        alpha = endog\n",
     "        # Per-event component densities at the observed values.\n",
     "        sig = self.signal.pdf(alpha)\n",
     "        bkg = self.background.pdf(alpha)\n",
     "        # sum_i log(n_sig * f_sig(x_i) + n_bkg * f_bkg(x_i));\n",
     "        # np.ma.log masks non-positive arguments instead of producing\n",
     "        # -inf/nan, so zero-density events drop out of the sum.\n",
     "        sumlogl = np.sum(\n",
     "            np.ma.log(\n",
     "                (n_sig * sig) + (n_bkg * bkg)\n",
     "            )\n",
     "        )\n",
     "        # Extended-likelihood Poisson term: subtract the total yield.\n",
     "        sumlogl -= n_tot\n",
     "        return -sumlogl"
128.    ]
129.   },
130.   {
131.    "cell_type": "code",
132.    "execution_count": 5,
134.    "outputs": [],
135.    "source": [
     "# Build the model from the observed events. The component PDFs are\n",
     "# fixed; only their yields are free parameters of the fit.\n",
     "model = TwoPeakLLH(endog=X, \n",
     "                   signal=pdf_a, \n",
     "                   background=pdf_b, \n",
     "                   )"
140.    ]
141.   },
142.   {
143.    "cell_type": "code",
144.    "execution_count": 6,
146.    "outputs": [
147.     {
148.      "name": "stdout",
149.      "output_type": "stream",
150.      "text": [
151.       "Optimization terminated successfully.\n",
152.       "         Current function value: -1.342181\n",
153.       "         Iterations: 75\n",
154.       "         Function evaluations: 143\n"
155.      ]
156.     }
157.    ],
158.    "source": [
     "# Maximize the likelihood, then bootstrap the fit.\n",
     "# NOTE(review): the bootstrap return value is discarded -- presumably\n",
     "# it caches results on `res` (e.g. for the std err / intervals shown\n",
     "# below); confirm against the statsmodels docs.\n",
     "res = model.fit()\n",
     "_ = res.bootstrap()"
161.    ]
162.   },
163.   {
164.    "cell_type": "code",
165.    "execution_count": 7,
167.    "outputs": [
168.     {
169.      "data": {
170.       "text/html": [
171.        "<table class=\"simpletable\">\n",
172.        "<caption>TwoPeakLLH Results</caption>\n",
173.        "<tr>\n",
174.        "  <th>Dep. Variable:</th>         <td>['alpha']</td>     <th>  Log-Likelihood:    </th> <td>  1382.4</td>\n",
175.        "</tr>\n",
176.        "<tr>\n",
177.        "  <th>Model:</th>                <td>TwoPeakLLH</td>     <th>  AIC:               </th> <td>     nan</td>\n",
178.        "</tr>\n",
179.        "<tr>\n",
180.        "  <th>Method:</th>           <td>Maximum Likelihood</td> <th>  BIC:               </th> <td>     nan</td>\n",
181.        "</tr>\n",
182.        "<tr>\n",
183.        "  <th>Date:</th>              <td>Tue, 22 May 2018</td>  <th>                     </th>     <td> </td>   \n",
184.        "</tr>\n",
185.        "<tr>\n",
186.        "  <th>Time:</th>                  <td>01:08:48</td>      <th>                     </th>     <td> </td>   \n",
187.        "</tr>\n",
188.        "<tr>\n",
189.        "  <th>No. Observations:</th>       <td>  1030</td>       <th>                     </th>     <td> </td>   \n",
190.        "</tr>\n",
191.        "<tr>\n",
192.        "  <th>Df Residuals:</th>           <td>   NaN</td>       <th>                     </th>     <td> </td>   \n",
193.        "</tr>\n",
194.        "<tr>\n",
195.        "  <th>Df Model:</th>               <td>   NaN</td>       <th>                     </th>     <td> </td>   \n",
196.        "</tr>\n",
197.        "</table>\n",
198.        "<table class=\"simpletable\">\n",
199.        "<tr>\n",
200.        "        <td></td>          <th>coef</th>     <th>std err</th>      <th>z</th>      <th>P>|z|</th>  <th>[0.025</th>    <th>0.975]</th>  \n",
201.        "</tr>\n",
202.        "<tr>\n",
203.        "  <th>n_signal</th>     <td>   31.7378</td> <td>    8.912</td> <td>    3.561</td> <td> 0.000</td> <td>   14.270</td> <td>   49.206</td>\n",
204.        "</tr>\n",
205.        "<tr>\n",
206.        "  <th>n_background</th> <td>  998.2622</td> <td>   32.341</td> <td>   30.867</td> <td> 0.000</td> <td>  934.875</td> <td> 1061.650</td>\n",
207.        "</tr>\n",
208.        "</table>"
209.       ],
210.       "text/plain": [
211.        "<class 'statsmodels.iolib.summary.Summary'>\n",
212.        "\"\"\"\n",
213.        "                              TwoPeakLLH Results                              \n",
214.        "==============================================================================\n",
215.        "Dep. Variable:              ['alpha']   Log-Likelihood:                 1382.4\n",
216.        "Model:                     TwoPeakLLH   AIC:                               nan\n",
217.        "Method:            Maximum Likelihood   BIC:                               nan\n",
218.        "Date:                Tue, 22 May 2018                                         \n",
219.        "Time:                        01:08:48                                         \n",
220.        "No. Observations:                1030                                         \n",
221.        "Df Residuals:                     NaN                                         \n",
222.        "Df Model:                         NaN                                         \n",
223.        "================================================================================\n",
224.        "                   coef    std err          z      P>|z|      [0.025      0.975]\n",
225.        "--------------------------------------------------------------------------------\n",
226.        "n_signal        31.7378      8.912      3.561      0.000      14.270      49.206\n",
227.        "n_background   998.2622     32.341     30.867      0.000     934.875    1061.650\n",
228.        "================================================================================\n",
229.        "\"\"\""
230.       ]
231.      },
232.      "execution_count": 7,
234.      "output_type": "execute_result"
235.     }
236.    ],
237.    "source": [
     "# Fitted yields with standard errors and confidence intervals; compare\n",
     "# with the true yields n_a = 30 and n_b = 1000 above.\n",
     "res.summary()"
239.    ]
240.   },
241.   {
242.    "cell_type": "code",
243.    "execution_count": null,
245.    "outputs": [],
246.    "source": []
247.   }
248.  ],
250.   "kernelspec": {
251.    "display_name": "Python 3",
252.    "language": "python",
253.    "name": "python3"
254.   },
255.   "language_info": {
256.    "codemirror_mode": {
257.     "name": "ipython",
258.     "version": 3
259.    },
260.    "file_extension": ".py",
261.    "mimetype": "text/x-python",
262.    "name": "python",
263.    "nbconvert_exporter": "python",
264.    "pygments_lexer": "ipython3",
265.    "version": "3.6.5"
266.   }
267.  },
268.  "nbformat": 4,
269.  "nbformat_minor": 2
270. }
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy.

Top