Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- {
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Preamble dependencies\n",
- "\n",
- "from __future__ import print_function\n",
- "\n",
- "# Python 2 and 3 compatible unicode file opening\n",
- "\n",
- "from io import open"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Notebook Spring Cleaning\n",
- "\n",
- "I have a _lot_ of notebooks, many of which are named \"Untitled\\*.ipynb\""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Untitled.ipynb\n",
- "Untitled1.ipynb\n",
- "Untitled10.ipynb\n",
- "Untitled11.ipynb\n",
- "Untitled12.ipynb\n",
- "Untitled13.ipynb\n",
- "Untitled14.ipynb\n",
- "Untitled15.ipynb\n",
- "Untitled16.ipynb\n",
- "Untitled17.ipynb\n",
- "Untitled18.ipynb\n",
- "Untitled19.ipynb\n",
- "Untitled2.ipynb\n",
- "Untitled20.ipynb\n",
- "Untitled21.ipynb\n",
- "Untitled22.ipynb\n",
- "Untitled23.ipynb\n",
- "Untitled24.ipynb\n",
- "Untitled25.ipynb\n",
- "Untitled26.ipynb\n",
- "Untitled27.ipynb\n",
- "Untitled28.ipynb\n",
- "Untitled29.ipynb\n",
- "Untitled3.ipynb\n",
- "Untitled30.ipynb\n",
- "Untitled31.ipynb\n",
- "Untitled32.ipynb\n",
- "Untitled33.ipynb\n",
- "Untitled34.ipynb\n",
- "Untitled35.ipynb\n",
- "Untitled36.ipynb\n",
- "Untitled37.ipynb\n",
- "Untitled38.ipynb\n",
- "Untitled39.ipynb\n",
- "Untitled4.ipynb\n",
- "Untitled40.ipynb\n",
- "Untitled41.ipynb\n",
- "Untitled42.ipynb\n",
- "Untitled43.ipynb\n",
- "Untitled44.ipynb\n",
- "Untitled45.ipynb\n",
- "Untitled46.ipynb\n",
- "Untitled47.ipynb\n",
- "Untitled48.ipynb\n",
- "Untitled49.ipynb\n",
- "Untitled5.ipynb\n",
- "Untitled50.ipynb\n",
- "Untitled51.ipynb\n",
- "Untitled52.ipynb\n",
- "Untitled53.ipynb\n",
- "Untitled54.ipynb\n",
- "Untitled55.ipynb\n",
- "Untitled56.ipynb\n",
- "Untitled57.ipynb\n",
- "Untitled58.ipynb\n",
- "Untitled59.ipynb\n",
- "Untitled6.ipynb\n",
- "Untitled60.ipynb\n",
- "Untitled61.ipynb\n",
- "Untitled62.ipynb\n",
- "Untitled63.ipynb\n",
- "Untitled64.ipynb\n",
- "Untitled65.ipynb\n",
- "Untitled66.ipynb\n",
- "Untitled67.ipynb\n",
- "Untitled68.ipynb\n",
- "Untitled69.ipynb\n",
- "Untitled7.ipynb\n",
- "Untitled70.ipynb\n",
- "Untitled71.ipynb\n",
- "Untitled72.ipynb\n",
- "Untitled73.ipynb\n",
- "Untitled74.ipynb\n",
- "Untitled75.ipynb\n",
- "Untitled76.ipynb\n",
- "Untitled77.ipynb\n",
- "Untitled78.ipynb\n",
- "Untitled79.ipynb\n",
- "Untitled8.ipynb\n",
- "Untitled80.ipynb\n",
- "Untitled81.ipynb\n",
- "Untitled82.ipynb\n",
- "Untitled83.ipynb\n",
- "Untitled84.ipynb\n",
- "Untitled85.ipynb\n",
- "Untitled86.ipynb\n",
- "Untitled87.ipynb\n",
- "Untitled9.ipynb\n"
- ]
- }
- ],
- "source": [
- "import glob\n",
- "\n",
- "# Collect the matching file names, order them in place, then print one per line\n",
- "untitled_paths = glob.glob(\"Untitled*.ipynb\")\n",
- "untitled_paths.sort()\n",
- "for untitled_path in untitled_paths:\n",
- "    print(untitled_path)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Jupyter provides a library called `nbformat` for working with notebooks in Python. We can use it to read and write notebooks."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "nbformat.notebooknode.NotebookNode"
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "import nbformat\n",
- "\n",
- "# Read one notebook as a sample. The `with` block closes the file handle as soon as\n",
- "# parsing finishes, and the encoding is explicit because notebooks are UTF-8 JSON\n",
- "# (the platform default encoding is not guaranteed to be UTF-8).\n",
- "with open(\"Untitled.ipynb\", encoding=\"utf-8\") as notebook_file:\n",
- "    nb = nbformat.read(notebook_file, as_version=4)\n",
- "\n",
- "type(nb)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "dict_keys(['cells', 'metadata', 'nbformat', 'nbformat_minor'])"
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Top-level fields of the notebook that was just parsed into `nb`\n",
- "nb.keys()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Load every untitled notebook as a (file name, parsed notebook) pair.\n",
- "# An explicit loop is used so each file is opened in a `with` block and closed right\n",
- "# after it is parsed, instead of leaving one open handle per notebook.\n",
- "notebooks = []\n",
- "for notebook_name in glob.glob(\"Untitled*.ipynb\"):\n",
- "    with open(notebook_name, encoding=\"utf-8\") as notebook_file:\n",
- "        notebooks.append((notebook_name, nbformat.read(notebook_file, as_version=4)))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Counter({1: 27, 2: 16, 3: 11, 4: 10, 6: 6, 0: 5, 5: 4, 8: 4, 9: 2, 11: 1, 7: 1, 17: 1}) \n",
- "\n",
- "5 notebook(s) with no cells and 27 notebook(s) with only one cell.\n"
- ]
- }
- ],
- "source": [
- "from collections import Counter\n",
- "\n",
- "# Tally how many notebooks have each cell count\n",
- "c = Counter(len(notebook.cells) for (name, notebook) in notebooks)\n",
- "\n",
- "print(c,\"\\n\")\n",
- "summary_template = \"{} notebook(s) with no cells and {} notebook(s) with only one cell.\"\n",
- "print(summary_template.format(c[0], c[1]))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Let's set up some simple heuristics for cleaning up notebooks.\n",
- "\n",
- "1. If it doesn't have any cells, delete it\n",
- "2. If it has a markdown cell, I probably mean to keep it\n",
- "3. If it has outputs, keep it"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {},
- "outputs": [],
- "source": [
- "def should_delete_notebook(notebook):\n",
- "    \"\"\"Return True when a notebook looks disposable.\n",
- "\n",
- "    A notebook is kept (False) as soon as one of its cells is a markdown cell or\n",
- "    has at least one output. A notebook with no cells at all is deleted (True).\n",
- "    \"\"\"\n",
- "    for cell in notebook.cells:\n",
- "        # Let any with markdown pass through\n",
- "        if cell.cell_type == 'markdown':\n",
- "            return False\n",
- "        # Raw cells carry no `outputs` field, so read it defensively instead of\n",
- "        # letting plain attribute access raise on them.\n",
- "        if getattr(cell, 'outputs', None):\n",
- "            return False\n",
- "    # Nothing worth keeping was found\n",
- "    return True"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Don't delete Untitled7.ipynb\n",
- "DELETE Untitled43.ipynb\n",
- "Don't delete Untitled61.ipynb\n",
- "DELETE Untitled25.ipynb\n",
- "Don't delete Untitled52.ipynb\n",
- "DELETE Untitled16.ipynb\n",
- "Don't delete Untitled70.ipynb\n",
- "Don't delete Untitled34.ipynb\n",
- "Don't delete Untitled87.ipynb\n",
- "DELETE Untitled69.ipynb\n",
- "Don't delete Untitled78.ipynb\n",
- "Don't delete Untitled47.ipynb\n",
- "DELETE Untitled83.ipynb\n",
- "Don't delete Untitled65.ipynb\n",
- "Don't delete Untitled29.ipynb\n",
- "Don't delete Untitled56.ipynb\n",
- "DELETE Untitled74.ipynb\n",
- "DELETE Untitled38.ipynb\n",
- "Don't delete Untitled12.ipynb\n",
- "DELETE Untitled30.ipynb\n",
- "Don't delete Untitled21.ipynb\n",
- "Don't delete Untitled3.ipynb\n",
- "DELETE Untitled81.ipynb\n",
- "Don't delete Untitled45.ipynb\n",
- "Don't delete Untitled9.ipynb\n",
- "Don't delete Untitled27.ipynb\n",
- "Don't delete Untitled63.ipynb\n",
- "DELETE Untitled18.ipynb\n",
- "DELETE Untitled54.ipynb\n",
- "DELETE Untitled36.ipynb\n",
- "DELETE Untitled72.ipynb\n",
- "Don't delete Untitled10.ipynb\n",
- "Don't delete Untitled1.ipynb\n",
- "Don't delete Untitled85.ipynb\n",
- "Don't delete Untitled49.ipynb\n",
- "Don't delete Untitled67.ipynb\n",
- "DELETE Untitled58.ipynb\n",
- "DELETE Untitled76.ipynb\n",
- "DELETE Untitled14.ipynb\n",
- "Don't delete Untitled50.ipynb\n",
- "Don't delete Untitled32.ipynb\n",
- "Don't delete Untitled23.ipynb\n",
- "DELETE Untitled41.ipynb\n",
- "DELETE Untitled5.ipynb\n",
- "Don't delete Untitled62.ipynb\n",
- "DELETE Untitled26.ipynb\n",
- "Don't delete Untitled80.ipynb\n",
- "Don't delete Untitled44.ipynb\n",
- "Don't delete Untitled8.ipynb\n",
- "DELETE Untitled71.ipynb\n",
- "Don't delete Untitled35.ipynb\n",
- "DELETE Untitled17.ipynb\n",
- "Don't delete Untitled53.ipynb\n",
- "DELETE Untitled79.ipynb\n",
- "Don't delete Untitled.ipynb\n",
- "Don't delete Untitled66.ipynb\n",
- "Don't delete Untitled84.ipynb\n",
- "DELETE Untitled48.ipynb\n",
- "Don't delete Untitled75.ipynb\n",
- "Don't delete Untitled39.ipynb\n",
- "DELETE Untitled57.ipynb\n",
- "DELETE Untitled31.ipynb\n",
- "DELETE Untitled13.ipynb\n",
- "Don't delete Untitled4.ipynb\n",
- "Don't delete Untitled40.ipynb\n",
- "DELETE Untitled22.ipynb\n",
- "DELETE Untitled28.ipynb\n",
- "Don't delete Untitled64.ipynb\n",
- "DELETE Untitled46.ipynb\n",
- "DELETE Untitled82.ipynb\n",
- "Don't delete Untitled37.ipynb\n",
- "DELETE Untitled73.ipynb\n",
- "Don't delete Untitled55.ipynb\n",
- "Don't delete Untitled19.ipynb\n",
- "DELETE Untitled11.ipynb\n",
- "DELETE Untitled2.ipynb\n",
- "DELETE Untitled20.ipynb\n",
- "Don't delete Untitled68.ipynb\n",
- "Don't delete Untitled86.ipynb\n",
- "DELETE Untitled77.ipynb\n",
- "Don't delete Untitled59.ipynb\n",
- "Don't delete Untitled33.ipynb\n",
- "DELETE Untitled51.ipynb\n",
- "DELETE Untitled15.ipynb\n",
- "Don't delete Untitled42.ipynb\n",
- "Don't delete Untitled6.ipynb\n",
- "Don't delete Untitled60.ipynb\n",
- "DELETE Untitled24.ipynb\n"
- ]
- }
- ],
- "source": [
- "# Dry run: report the verdict for every notebook without touching any files\n",
- "for (name, notebook) in notebooks:\n",
- "    should_delete = should_delete_notebook(notebook)\n",
- "    if should_delete:\n",
- "        print(\"DELETE\", name)\n",
- "    else:\n",
- "        print(\"Don't delete\", name)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Time to _really_ delete those notebooks now."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Deleting Untitled43.ipynb\n",
- "Deleting Untitled25.ipynb\n",
- "Deleting Untitled16.ipynb\n",
- "Deleting Untitled69.ipynb\n",
- "Deleting Untitled83.ipynb\n",
- "Deleting Untitled74.ipynb\n",
- "Deleting Untitled38.ipynb\n",
- "Deleting Untitled30.ipynb\n",
- "Deleting Untitled81.ipynb\n",
- "Deleting Untitled18.ipynb\n",
- "Deleting Untitled54.ipynb\n",
- "Deleting Untitled36.ipynb\n",
- "Deleting Untitled72.ipynb\n",
- "Deleting Untitled58.ipynb\n",
- "Deleting Untitled76.ipynb\n",
- "Deleting Untitled14.ipynb\n",
- "Deleting Untitled41.ipynb\n",
- "Deleting Untitled5.ipynb\n",
- "Deleting Untitled26.ipynb\n",
- "Deleting Untitled71.ipynb\n",
- "Deleting Untitled17.ipynb\n",
- "Deleting Untitled79.ipynb\n",
- "Deleting Untitled48.ipynb\n",
- "Deleting Untitled57.ipynb\n",
- "Deleting Untitled31.ipynb\n",
- "Deleting Untitled13.ipynb\n",
- "Deleting Untitled22.ipynb\n",
- "Deleting Untitled28.ipynb\n",
- "Deleting Untitled46.ipynb\n",
- "Deleting Untitled82.ipynb\n",
- "Deleting Untitled73.ipynb\n",
- "Deleting Untitled11.ipynb\n",
- "Deleting Untitled2.ipynb\n",
- "Deleting Untitled20.ipynb\n",
- "Deleting Untitled77.ipynb\n",
- "Deleting Untitled51.ipynb\n",
- "Deleting Untitled15.ipynb\n",
- "Deleting Untitled24.ipynb\n"
- ]
- }
- ],
- "source": [
- "import errno\n",
- "import os\n",
- "\n",
- "for (name, notebook) in notebooks:\n",
- "    should_delete = should_delete_notebook(notebook)\n",
- "    if not should_delete:\n",
- "        continue\n",
- "    print(\"Deleting {}\".format(name))\n",
- "    try:\n",
- "        os.unlink(name)\n",
- "    except OSError as e:\n",
- "        # `notebooks` is a snapshot taken earlier, so re-running this cell meets files\n",
- "        # that are already gone. Only a missing file is ignored; anything else is raised.\n",
- "        if e.errno != errno.ENOENT:\n",
- "            raise"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Let's see what we can determine from the rest of the notebooks here"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Re-read whatever still matches the pattern after the deletions.\n",
- "# Each file is opened in a `with` block so its handle is closed right after parsing.\n",
- "notebooks = []\n",
- "for notebook_name in glob.glob(\"Untitled*.ipynb\"):\n",
- "    with open(notebook_name, encoding=\"utf-8\") as notebook_file:\n",
- "        notebooks.append((notebook_name, nbformat.read(notebook_file, as_version=4)))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "On commuter (and nteract), this renders a full-blown, navigable JSON object for inspection, which helped me decide that I wanted to delete several notebooks."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/json": {
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": "import os\n"
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": "'/data/tmp/genie/jobs/20170612_213051.010216.prodsparkshell200-spinnaker/genie/applications/spark200/dependencies/spark-2.0.0/python:/data/tmp/genie/jobs/20170612_213051.010216.prodsparkshell200-spinnaker/genie/applications/spark200/dependencies/spark-2.0.0/python/lib/py4j-0.10.1-src.zip'"
- },
- "execution_count": 2,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": "os.environ['PYTHONPATH']"
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": ""
- }
- ],
- "metadata": {
- "hide_input": false,
- "kernelspec": {
- "display_name": "Python 2 (PySpark 2.0.0)",
- "language": "python",
- "name": "pyspark2"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 2
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython2",
- "version": "2.7.6"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
- },
- "text/plain": [
- "<IPython.core.display.JSON object>"
- ]
- },
- "metadata": {
- "application/json": {
- "expanded": false
- }
- },
- "output_type": "display_data"
- }
- ],
- "source": [
- "from IPython.display import display, JSON\n",
- "\n",
- "# Each entry is a (file name, parsed notebook) pair; show the first parsed notebook as JSON\n",
- "first_notebook = notebooks[0][1]\n",
- "display(JSON(first_notebook))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "As I worked through these, though, I realized that just seeing all the source is really helpful."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {},
- "outputs": [],
- "source": [
- "import IPython.display as d\n",
- "import html\n",
- "\n",
- "def summarize(notebooks):\n",
- "    \"\"\"Display each notebook's file name followed by the source of all its cells.\n",
- "\n",
- "    notebooks: iterable of (file name, parsed notebook) pairs.\n",
- "    \"\"\"\n",
- "    for (name, notebook) in notebooks:\n",
- "        # File names are arbitrary text, so escape them just like the source below\n",
- "        d.display(d.HTML(\"<h1>{}</h1>\".format(html.escape(name))))\n",
- "\n",
- "        # Join the cell sources with newlines and escape them so the code shows up\n",
- "        # literally inside <pre> instead of being interpreted as markup\n",
- "        full_source = html.escape(\"\\n\".join([cell.source for cell in notebook.cells]))\n",
- "        d.display(d.HTML(\"<pre>{}</pre>\".format(full_source)))\n",
- "        d.display(d.HTML(\"<hr />\"))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 16,
- "metadata": {
- "hide_input": false,
- "scrolled": false
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "<h1>Untitled7.ipynb</h1>"
- ],
- "text/plain": [
- "<IPython.core.display.HTML object>"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "text/html": [
- "<pre>import os\n",
- "\n",
- "os.environ['PYTHONPATH']\n",
- "</pre>"
- ],
- "text/plain": [
- "<IPython.core.display.HTML object>"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "text/html": [
- "<hr />"
- ],
- "text/plain": [
- "<IPython.core.display.HTML object>"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "text/html": [
- "<h1>Untitled61.ipynb</h1>"
- ],
- "text/plain": [
- "<IPython.core.display.HTML object>"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "text/html": [
- "<pre>spark\n",
- "</pre>"
- ],
- "text/plain": [
- "<IPython.core.display.HTML object>"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "text/html": [
- "<hr />"
- ],
- "text/plain": [
- "<IPython.core.display.HTML object>"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "text/html": [
- "<h1>Untitled52.ipynb</h1>"
- ],
- "text/plain": [
- "<IPython.core.display.HTML object>"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "text/html": [
- "<pre>from pyspark.sql import SparkSession\n",
- "\n",
- "spark = (\n",
- " SparkSession.builder.appName('PySpark Demo')\n",
- " .config("packages", "org.postgresql:postgresql:42.1.1")\n",
- " .getOrCreate()\n",
- ")\n",
- "spark\n",
- "spark\n",
- "</pre>"
- ],
- "text/plain": [
- "<IPython.core.display.HTML object>"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "text/html": [
- "<hr />"
- ],
- "text/plain": [
- "<IPython.core.display.HTML object>"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "# Only the first three notebooks are summarized here\n",
- "summarize(notebooks[:3])\n",
- "\n",
- "# Note that you can have it summarize all of them with `summarize(notebooks)`.\n",
- "# I left the rest out to keep the exported notebook tidy"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 17,
- "metadata": {},
- "outputs": [],
- "source": [
- "# File names to delete; consumed by the cleanup loop below\n",
- "delete_me = [\n",
- " \"Untitled7.ipynb\",\n",
- " \"Untitled61.ipynb\",\n",
- "]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 18,
- "metadata": {},
- "outputs": [],
- "source": [
- "# (current name, new name) pairs; consumed by the rename loop below\n",
- "rename_me = [\n",
- " (\"Untitled52.ipynb\", \"Load Postgres with PySpark.ipynb\")\n",
- "]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 19,
- "metadata": {},
- "outputs": [],
- "source": [
- "import errno\n",
- "\n",
- "# Delete the hand-picked notebooks. A file that is already gone is fine (the cell may\n",
- "# have been run before); any other OS-level failure is reported and the loop continues.\n",
- "for name in delete_me:\n",
- "    try:\n",
- "        os.unlink(name)\n",
- "    except OSError as e:\n",
- "        # Compare error codes by value; only OSError carries a meaningful errno\n",
- "        if e.errno != errno.ENOENT:\n",
- "            print(\"unable to delete {}\".format(name))\n",
- "            print(e)\n",
- "\n",
- "# Rename the notebooks worth keeping, with the same tolerance for a missing source file.\n",
- "for name, new_name in rename_me:\n",
- "    try:\n",
- "        os.rename(name, new_name)\n",
- "    except OSError as e:\n",
- "        if e.errno != errno.ENOENT:\n",
- "            print(\"unable to rename {} to {}\".format(name, new_name))\n",
- "            print(e)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 18,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "# Re-read whatever still matches the pattern after the manual deletes and renames.\n",
- "# Each file is opened in a `with` block so its handle is closed right after parsing.\n",
- "notebooks = []\n",
- "for notebook_name in glob.glob(\"Untitled*.ipynb\"):\n",
- "    with open(notebook_name, encoding=\"utf-8\") as notebook_file:\n",
- "        notebooks.append((notebook_name, nbformat.read(notebook_file, as_version=4)))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 19,
- "metadata": {
- "scrolled": false
- },
- "outputs": [],
- "source": [
- "# Summarize every notebook currently held in `notebooks`\n",
- "summarize(notebooks)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Notebooks all cleaned up!"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "hide_input": false,
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.6.2"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
- }
Add Comment
Please, Sign In to add comment