Guest User

Untitled

a guest
Jan 21st, 2019
82
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 7.64 KB | None | 0 0
  1. {
  2. "cells": [
  3. {
  4. "cell_type": "code",
  5. "execution_count": 1,
  6. "metadata": {
  7. "autoscroll": false,
  8. "collapsed": false,
  9. "ein.hycell": false,
  10. "ein.tags": "worksheet-0",
  11. "slideshow": {
  12. "slide_type": "-"
  13. }
  14. },
  15. "outputs": [],
  16. "source": [
  17. "with open('textfile.txt') as f:\n",
  18. " txt = f.read()"
  19. ]
  20. },
  21. {
  22. "cell_type": "code",
  23. "execution_count": 60,
  24. "metadata": {
  25. "autoscroll": false,
  26. "collapsed": false,
  27. "ein.hycell": false,
  28. "ein.tags": "worksheet-0",
  29. "slideshow": {
  30. "slide_type": "-"
  31. }
  32. },
  33. "outputs": [],
  34. "source": [
  35. "from re import sub\n",
  36. "\n",
  37. "# Collapse every run of whitespace (spaces, tabs, \\n) into a single space, so the text is one line.\n",
  38. "txt = ' '.join(txt.split())\n",
  39. "\n",
  40. "# Replace all ? and ! characters with full stops (the class [?|!] also matches a literal |).\n",
  41. "txt = sub('[?|!]', '.', txt)\n",
  42. "\n",
  43. "# Let's make everything lowercase\n",
  44. "txt = txt.lower()"
  45. ]
  46. },
  47. {
  48. "cell_type": "code",
  49. "execution_count": 61,
  50. "metadata": {
  51. "autoscroll": false,
  52. "collapsed": false,
  53. "ein.hycell": false,
  54. "ein.tags": "worksheet-0",
  55. "slideshow": {
  56. "slide_type": "-"
  57. }
  58. },
  59. "outputs": [
  60. {
  61. "data": {
  62. "text/plain": [
  63. "30"
  64. ]
  65. },
  66. "execution_count": 61,
  67. "metadata": {},
  68. "output_type": "execute_result"
  69. }
  70. ],
  71. "source": [
  72. "# Get the number of sentences (counts the pieces between full stops, so a trailing '.' adds one empty piece)\n",
  73. "\n",
  74. "len(txt.split('.'))"
  75. ]
  76. },
  77. {
  78. "cell_type": "code",
  79. "execution_count": 89,
  80. "metadata": {
  81. "autoscroll": false,
  82. "collapsed": false,
  83. "ein.hycell": false,
  84. "ein.tags": "worksheet-0",
  85. "slideshow": {
  86. "slide_type": "-"
  87. }
  88. },
  89. "outputs": [],
  90. "source": [
  91. "# Let's get rid of all punctuation (semicolons, commas, hyphens, etc.) to create simple words\n",
  92. "# to do this we remove everything that's not an alphanumeric character, and replace it with a space\n",
  93. "# (thus splitting hyphenated words into two and removing apostrophes.)\n",
  94. "\n",
  95. "words = sub('[^a-z0-9]', ' ', txt).split()"
  96. ]
  97. },
  98. {
  99. "cell_type": "code",
  100. "execution_count": 90,
  101. "metadata": {
  102. "autoscroll": false,
  103. "collapsed": false,
  104. "ein.hycell": false,
  105. "ein.tags": "worksheet-0",
  106. "slideshow": {
  107. "slide_type": "-"
  108. }
  109. },
  110. "outputs": [
  111. {
  112. "data": {
  113. "text/plain": [
  114. "565"
  115. ]
  116. },
  117. "execution_count": 90,
  118. "metadata": {},
  119. "output_type": "execute_result"
  120. }
  121. ],
  122. "source": [
  123. "len(words)"
  124. ]
  125. },
  126. {
  127. "cell_type": "code",
  128. "execution_count": 91,
  129. "metadata": {
  130. "autoscroll": false,
  131. "collapsed": false,
  132. "ein.hycell": false,
  133. "ein.tags": "worksheet-0",
  134. "slideshow": {
  135. "slide_type": "-"
  136. }
  137. },
  138. "outputs": [
  139. {
  140. "data": {
  141. "text/plain": [
  142. "{'why': 1,\n 'do': 1,\n 'people': 1,\n 'use': 2,\n 'python': 23,\n 'because': 2,\n 'there': 5,\n 'are': 4,\n 'many': 3,\n 'programming': 6,\n 'languages': 3,\n 'available': 1,\n 'today': 2,\n 'this': 4,\n 'is': 9,\n 'the': 16,\n 'usual': 1,\n 'first': 2,\n 'question': 2,\n 'of': 14,\n 'newcomers': 1,\n 'given': 1,\n 'that': 2,\n 'roughly': 2,\n '1': 1,\n 'million': 1,\n 'users': 3,\n 'out': 1,\n 'at': 1,\n 'moment': 1,\n 'really': 1,\n 'no': 1,\n 'way': 1,\n 'to': 13,\n 'answer': 1,\n 'with': 7,\n 'complete': 1,\n 'accuracy': 1,\n 'choice': 1,\n 'development': 2,\n 'tools': 4,\n 'sometimes': 1,\n 'based': 2,\n 'on': 4,\n 'unique': 1,\n 'constraints': 1,\n 'or': 3,\n 'personal': 1,\n 'preference': 1,\n 'but': 1,\n 'after': 2,\n 'teaching': 1,\n '225': 1,\n 'groups': 1,\n 'and': 22,\n 'over': 4,\n '3': 1,\n '000': 1,\n 'students': 1,\n 'during': 1,\n 'last': 1,\n '12': 1,\n 'years': 1,\n 'some': 2,\n 'common': 1,\n 'themes': 1,\n 'have': 1,\n 'emerged': 1,\n 'primary': 1,\n 'factors': 2,\n 'cited': 1,\n 'by': 2,\n 'seem': 1,\n 'be': 7,\n 'these': 2,\n 'software': 4,\n 'quality': 3,\n 'for': 6,\n 's': 4,\n 'focus': 1,\n 'readability': 1,\n 'coherence': 1,\n 'in': 6,\n 'general': 1,\n 'sets': 1,\n 'it': 5,\n 'apart': 1,\n 'from': 3,\n 'other': 3,\n 'scripting': 3,\n 'world': 1,\n 'code': 7,\n 'designed': 1,\n 'readable': 1,\n 'hence': 1,\n 'reusable': 1,\n 'maintainable': 1,\n 'much': 2,\n 'more': 6,\n 'so': 1,\n 'than': 2,\n 'traditional': 1,\n 'uniformity': 1,\n 'makes': 1,\n 'easy': 1,\n 'understand': 1,\n 'even': 2,\n 'if': 1,\n 'you': 1,\n 'did': 1,\n 'not': 2,\n 'write': 1,\n 'addition': 2,\n 'has': 2,\n 'deep': 1,\n 'support': 3,\n 'advanced': 1,\n 'reuse': 1,\n 'mechanisms': 2,\n 'such': 4,\n 'as': 8,\n 'object': 1,\n 'oriented': 1,\n 'oop': 1,\n 'developer': 2,\n 'productivity': 4,\n 'boosts': 1,\n 'times': 1,\n 'beyond': 1,\n 'compiled': 1,\n 'statically': 1,\n 'typed': 1,\n 'c': 7,\n 'java': 3,\n 'typically': 1,\n 'one': 2,\n 'third': 3,\n 'fifth': 
1,\n 'size': 1,\n 'equivalent': 2,\n 'means': 1,\n 'less': 3,\n 'type': 1,\n 'debug': 1,\n 'maintain': 1,\n 'fact': 1,\n 'programs': 4,\n 'also': 1,\n 'run': 2,\n 'immediately': 1,\n 'without': 1,\n 'lengthy': 1,\n 'compile': 1,\n 'link': 1,\n 'steps': 1,\n 'required': 1,\n 'further': 1,\n 'boosting': 1,\n 'programmer': 1,\n 'speed': 1,\n 'program': 2,\n 'portability': 1,\n 'most': 3,\n 'unchanged': 1,\n 'all': 1,\n 'major': 1,\n 'computer': 1,\n 'platforms': 1,\n 'porting': 1,\n 'between': 2,\n 'linux': 1,\n 'windows': 1,\n 'example': 1,\n 'usually': 1,\n 'just': 1,\n 'a': 8,\n 'matter': 1,\n 'copying': 1,\n 'script': 1,\n 'machines': 1,\n 'moreover': 1,\n 'offers': 2,\n 'multiple': 1,\n 'options': 1,\n 'coding': 1,\n 'portable': 3,\n 'graphical': 1,\n 'user': 1,\n 'interfaces': 3,\n 'database': 1,\n 'access': 2,\n 'web': 1,\n 'systems': 1,\n 'operating': 1,\n 'system': 2,\n 'including': 1,\n 'launches': 1,\n 'directory': 1,\n 'processing': 1,\n 'they': 1,\n 'can': 10,\n 'possibly': 1,\n 'libraries': 3,\n 'comes': 1,\n 'large': 1,\n 'collection': 2,\n 'prebuilt': 1,\n 'functionality': 1,\n 'known': 1,\n 'standard': 1,\n 'library': 2,\n 'supports': 1,\n 'an': 4,\n 'array': 1,\n 'application': 3,\n 'level': 1,\n 'tasks': 1,\n 'text': 1,\n 'pattern': 1,\n 'matching': 1,\n 'network': 1,\n 'extended': 1,\n 'both': 1,\n 'homegrown': 1,\n 'vast': 1,\n 'party': 2,\n 'domain': 1,\n 'website': 1,\n 'construction': 1,\n 'numeric': 2,\n 'serial': 2,\n 'port': 1,\n 'game': 1,\n 'numpy': 1,\n 'extension': 2,\n 'instance': 1,\n 'been': 1,\n 'described': 1,\n 'free': 1,\n 'powerful': 1,\n 'matlab': 1,\n 'component': 1,\n 'integration': 2,\n 'scripts': 1,\n 'easily': 1,\n 'communicate': 2,\n 'parts': 1,\n 'using': 1,\n 'variety': 1,\n 'integrations': 1,\n 'allow': 1,\n 'used': 1,\n 'product': 1,\n 'customization': 1,\n 'tool': 2,\n 'invoke': 1,\n 'called': 1,\n 'integrate': 1,\n 'net': 1,\n 'components': 1,\n 'frameworks': 1,\n 'com': 1,\n 'interface': 1,\n 'devices': 1,\n 
'ports': 1,\n 'interact': 1,\n 'networks': 1,\n 'like': 1,\n 'soap': 1,\n 'xml': 1,\n 'rpc': 1,\n 'corba': 1,\n 'standalone': 1,\n 'enjoyment': 1,\n 'ease': 1,\n 'built': 1,\n 'toolset': 1,\n 'make': 1,\n 'act': 1,\n 'pleasure': 1,\n 'chore': 1,\n 'although': 1,\n 'may': 1,\n 'intangible': 1,\n 'benefit': 1,\n 'its': 1,\n 'effect': 1,\n 'important': 1,\n 'asset': 1,\n 'two': 1,\n 'probably': 1,\n 'compelling': 1,\n 'benefits': 1}"
  143. ]
  144. },
  145. "execution_count": 91,
  146. "metadata": {},
  147. "output_type": "execute_result"
  148. }
  149. ],
  150. "source": [
  151. "# Now let's count the unique words!\n",
  152. "\n",
  153. "counts = {}\n",
  154. "\n",
  155. "for w in words:\n",
  156. " if counts.get(w):\n",
  157. " counts[w] += 1\n",
  158. " else:\n",
  159. " counts[w] = 1\n",
  160. "\n",
  161. "counts"
  162. ]
  163. }
  164. ],
  165. "metadata": {
  166. "kernelspec": {
  167. "display_name": "Python 3",
  168. "name": "python3"
  169. },
  170. "name": "Untitled1.ipynb"
  171. },
  172. "nbformat": 4,
  173. "nbformat_minor": 2
  174. }
Add Comment
Please, Sign In to add comment