Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- {
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {
- "autoscroll": false,
- "collapsed": false,
- "ein.hycell": false,
- "ein.tags": "worksheet-0",
- "slideshow": {
- "slide_type": "-"
- }
- },
- "outputs": [],
- "source": [
- "# Load the entire input document into memory as a single string.\n",
- "with open('textfile.txt') as text_file:\n",
- "    txt = text_file.read()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 60,
- "metadata": {
- "autoscroll": false,
- "collapsed": false,
- "ein.hycell": false,
- "ein.tags": "worksheet-0",
- "slideshow": {
- "slide_type": "-"
- }
- },
- "outputs": [],
- "source": [
- "from re import sub\n",
- "\n",
- "# Collapse every run of whitespace (newlines, tabs, repeated spaces) into a single space.\n",
- "txt = ' '.join(txt.split())\n",
- "\n",
- "# Replace every ? and ! with a full stop so that '.' is the only sentence terminator.\n",
- "# Inside a character class '|' is a literal pipe (not alternation), so it must not be\n",
- "# listed there or pipe characters would be turned into full stops as well.\n",
- "txt = sub('[?!]', '.', txt)\n",
- "\n",
- "# Lowercase everything so that 'Python' and 'python' become the same token.\n",
- "txt = txt.lower()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 61,
- "metadata": {
- "autoscroll": false,
- "collapsed": false,
- "ein.hycell": false,
- "ein.tags": "worksheet-0",
- "slideshow": {
- "slide_type": "-"
- }
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "30"
- ]
- },
- "execution_count": 61,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Get number of sentences: count only the non-blank fragments between full stops,\n",
- "# because a trailing '.' or consecutive terminators produce empty fragments.\n",
- "\n",
- "len([fragment for fragment in txt.split('.') if fragment.strip()])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 89,
- "metadata": {
- "autoscroll": false,
- "collapsed": false,
- "ein.hycell": false,
- "ein.tags": "worksheet-0",
- "slideshow": {
- "slide_type": "-"
- }
- },
- "outputs": [],
- "source": [
- "# Strip all punctuation (semicolons, commas, hyphens, etc.) to obtain plain words.\n",
- "# Every character that is not a lowercase letter or a digit is replaced with a space,\n",
- "# so hyphenated words split in two and apostrophes separate their neighbours.\n",
- "\n",
- "words = sub(pattern='[^a-z0-9]', repl=' ', string=txt).split()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 90,
- "metadata": {
- "autoscroll": false,
- "collapsed": false,
- "ein.hycell": false,
- "ein.tags": "worksheet-0",
- "slideshow": {
- "slide_type": "-"
- }
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "565"
- ]
- },
- "execution_count": 90,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Total number of word tokens in the text (repeats included).\n",
- "len(words)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 91,
- "metadata": {
- "autoscroll": false,
- "collapsed": false,
- "ein.hycell": false,
- "ein.tags": "worksheet-0",
- "slideshow": {
- "slide_type": "-"
- }
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{'why': 1,\n 'do': 1,\n 'people': 1,\n 'use': 2,\n 'python': 23,\n 'because': 2,\n 'there': 5,\n 'are': 4,\n 'many': 3,\n 'programming': 6,\n 'languages': 3,\n 'available': 1,\n 'today': 2,\n 'this': 4,\n 'is': 9,\n 'the': 16,\n 'usual': 1,\n 'first': 2,\n 'question': 2,\n 'of': 14,\n 'newcomers': 1,\n 'given': 1,\n 'that': 2,\n 'roughly': 2,\n '1': 1,\n 'million': 1,\n 'users': 3,\n 'out': 1,\n 'at': 1,\n 'moment': 1,\n 'really': 1,\n 'no': 1,\n 'way': 1,\n 'to': 13,\n 'answer': 1,\n 'with': 7,\n 'complete': 1,\n 'accuracy': 1,\n 'choice': 1,\n 'development': 2,\n 'tools': 4,\n 'sometimes': 1,\n 'based': 2,\n 'on': 4,\n 'unique': 1,\n 'constraints': 1,\n 'or': 3,\n 'personal': 1,\n 'preference': 1,\n 'but': 1,\n 'after': 2,\n 'teaching': 1,\n '225': 1,\n 'groups': 1,\n 'and': 22,\n 'over': 4,\n '3': 1,\n '000': 1,\n 'students': 1,\n 'during': 1,\n 'last': 1,\n '12': 1,\n 'years': 1,\n 'some': 2,\n 'common': 1,\n 'themes': 1,\n 'have': 1,\n 'emerged': 1,\n 'primary': 1,\n 'factors': 2,\n 'cited': 1,\n 'by': 2,\n 'seem': 1,\n 'be': 7,\n 'these': 2,\n 'software': 4,\n 'quality': 3,\n 'for': 6,\n 's': 4,\n 'focus': 1,\n 'readability': 1,\n 'coherence': 1,\n 'in': 6,\n 'general': 1,\n 'sets': 1,\n 'it': 5,\n 'apart': 1,\n 'from': 3,\n 'other': 3,\n 'scripting': 3,\n 'world': 1,\n 'code': 7,\n 'designed': 1,\n 'readable': 1,\n 'hence': 1,\n 'reusable': 1,\n 'maintainable': 1,\n 'much': 2,\n 'more': 6,\n 'so': 1,\n 'than': 2,\n 'traditional': 1,\n 'uniformity': 1,\n 'makes': 1,\n 'easy': 1,\n 'understand': 1,\n 'even': 2,\n 'if': 1,\n 'you': 1,\n 'did': 1,\n 'not': 2,\n 'write': 1,\n 'addition': 2,\n 'has': 2,\n 'deep': 1,\n 'support': 3,\n 'advanced': 1,\n 'reuse': 1,\n 'mechanisms': 2,\n 'such': 4,\n 'as': 8,\n 'object': 1,\n 'oriented': 1,\n 'oop': 1,\n 'developer': 2,\n 'productivity': 4,\n 'boosts': 1,\n 'times': 1,\n 'beyond': 1,\n 'compiled': 1,\n 'statically': 1,\n 'typed': 1,\n 'c': 7,\n 'java': 3,\n 'typically': 1,\n 'one': 2,\n 'third': 3,\n 'fifth': 1,\n 
'size': 1,\n 'equivalent': 2,\n 'means': 1,\n 'less': 3,\n 'type': 1,\n 'debug': 1,\n 'maintain': 1,\n 'fact': 1,\n 'programs': 4,\n 'also': 1,\n 'run': 2,\n 'immediately': 1,\n 'without': 1,\n 'lengthy': 1,\n 'compile': 1,\n 'link': 1,\n 'steps': 1,\n 'required': 1,\n 'further': 1,\n 'boosting': 1,\n 'programmer': 1,\n 'speed': 1,\n 'program': 2,\n 'portability': 1,\n 'most': 3,\n 'unchanged': 1,\n 'all': 1,\n 'major': 1,\n 'computer': 1,\n 'platforms': 1,\n 'porting': 1,\n 'between': 2,\n 'linux': 1,\n 'windows': 1,\n 'example': 1,\n 'usually': 1,\n 'just': 1,\n 'a': 8,\n 'matter': 1,\n 'copying': 1,\n 'script': 1,\n 'machines': 1,\n 'moreover': 1,\n 'offers': 2,\n 'multiple': 1,\n 'options': 1,\n 'coding': 1,\n 'portable': 3,\n 'graphical': 1,\n 'user': 1,\n 'interfaces': 3,\n 'database': 1,\n 'access': 2,\n 'web': 1,\n 'systems': 1,\n 'operating': 1,\n 'system': 2,\n 'including': 1,\n 'launches': 1,\n 'directory': 1,\n 'processing': 1,\n 'they': 1,\n 'can': 10,\n 'possibly': 1,\n 'libraries': 3,\n 'comes': 1,\n 'large': 1,\n 'collection': 2,\n 'prebuilt': 1,\n 'functionality': 1,\n 'known': 1,\n 'standard': 1,\n 'library': 2,\n 'supports': 1,\n 'an': 4,\n 'array': 1,\n 'application': 3,\n 'level': 1,\n 'tasks': 1,\n 'text': 1,\n 'pattern': 1,\n 'matching': 1,\n 'network': 1,\n 'extended': 1,\n 'both': 1,\n 'homegrown': 1,\n 'vast': 1,\n 'party': 2,\n 'domain': 1,\n 'website': 1,\n 'construction': 1,\n 'numeric': 2,\n 'serial': 2,\n 'port': 1,\n 'game': 1,\n 'numpy': 1,\n 'extension': 2,\n 'instance': 1,\n 'been': 1,\n 'described': 1,\n 'free': 1,\n 'powerful': 1,\n 'matlab': 1,\n 'component': 1,\n 'integration': 2,\n 'scripts': 1,\n 'easily': 1,\n 'communicate': 2,\n 'parts': 1,\n 'using': 1,\n 'variety': 1,\n 'integrations': 1,\n 'allow': 1,\n 'used': 1,\n 'product': 1,\n 'customization': 1,\n 'tool': 2,\n 'invoke': 1,\n 'called': 1,\n 'integrate': 1,\n 'net': 1,\n 'components': 1,\n 'frameworks': 1,\n 'com': 1,\n 'interface': 1,\n 'devices': 1,\n 'ports': 
1,\n 'interact': 1,\n 'networks': 1,\n 'like': 1,\n 'soap': 1,\n 'xml': 1,\n 'rpc': 1,\n 'corba': 1,\n 'standalone': 1,\n 'enjoyment': 1,\n 'ease': 1,\n 'built': 1,\n 'toolset': 1,\n 'make': 1,\n 'act': 1,\n 'pleasure': 1,\n 'chore': 1,\n 'although': 1,\n 'may': 1,\n 'intangible': 1,\n 'benefit': 1,\n 'its': 1,\n 'effect': 1,\n 'important': 1,\n 'asset': 1,\n 'two': 1,\n 'probably': 1,\n 'compelling': 1,\n 'benefits': 1}"
- ]
- },
- "execution_count": 91,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Now let's count the unique words!\n",
- "# Build a word -> occurrences mapping; keys stay in order of first appearance.\n",
- "\n",
- "counts = {}\n",
- "\n",
- "for w in words:\n",
- "    # A word not seen before starts from 0, so its first occurrence stores 1.\n",
- "    counts[w] = counts.get(w, 0) + 1\n",
- "\n",
- "counts"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "name": "python3"
- },
- "name": "Untitled1.ipynb"
- },
- "nbformat": 4,
- "nbformat_minor": 2
- }
Add Comment
Please, Sign In to add comment