Advertisement
Guest User

Untitled

a guest
Jul 17th, 2017
78
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.63 KB | None | 0 0
  1. {
  2. "cells": [
  3. {
  4. "cell_type": "code",
  5. "execution_count": 7,
  6. "metadata": {
  7. "collapsed": true
  8. },
  9. "outputs": [],
  10. "source": [
  11. "from pyspark.sql import functions as F\n",
  12. "from pyspark.sql import types as T"
  13. ]
  14. },
  15. {
  16. "cell_type": "code",
  17. "execution_count": 2,
  18. "metadata": {
  19. "collapsed": false
  20. },
  21. "outputs": [],
  22. "source": [
  23. "df = sqlContext.read.parquet(\"/guoda/data/idigbio-20170708T023306.parquet\")"
  24. ]
  25. },
  26. {
  27. "cell_type": "code",
  28. "execution_count": 3,
  29. "metadata": {
  30. "collapsed": false
  31. },
  32. "outputs": [],
  33. "source": [
  34. "ic_cc = df.groupby([\"institutioncode\",\"collectioncode\"]).count()"
  35. ]
  36. },
  37. {
  38. "cell_type": "code",
  39. "execution_count": 10,
  40. "metadata": {
  41. "collapsed": false
  42. },
  43. "outputs": [
  44. {
  45. "data": {
  46. "text/plain": [
  47. "128926"
  48. ]
  49. },
  50. "execution_count": 10,
  51. "metadata": {},
  52. "output_type": "execute_result"
  53. }
  54. ],
  55. "source": [
  56. "ic_cc.count()"
  57. ]
  58. },
  59. {
  60. "cell_type": "code",
  61. "execution_count": 12,
  62. "metadata": {
  63. "collapsed": false
  64. },
  65. "outputs": [
  66. {
  67. "data": {
  68. "text/plain": [
  69. "3573"
  70. ]
  71. },
  72. "execution_count": 12,
  73. "metadata": {},
  74. "output_type": "execute_result"
  75. }
  76. ],
  77. "source": [
  78. "ic_cc[F.col(\"count\") > 1].count()"
  79. ]
  80. },
  81. {
  82. "cell_type": "code",
  83. "execution_count": 32,
  84. "metadata": {
  85. "collapsed": false
  86. },
  87. "outputs": [],
  88. "source": [
  89. "h = ic_cc.select(\"count\").rdd.flatMap(lambda x: x).histogram(list(range(1,10)) + [10**x for x in range(1,7)])"
  90. ]
  91. },
  92. {
  93. "cell_type": "code",
  94. "execution_count": 42,
  95. "metadata": {
  96. "collapsed": false
  97. },
  98. "outputs": [
  99. {
  100. "data": {
  101. "text/plain": [
  102. "([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 100, 1000, 10000, 100000, 1000000],\n",
  103. " [125353, 453, 154, 109, 94, 53, 48, 45, 31, 656, 547, 588, 572, 213])"
  104. ]
  105. },
  106. "execution_count": 42,
  107. "metadata": {},
  108. "output_type": "execute_result"
  109. }
  110. ],
  111. "source": [
  112. "h"
  113. ]
  114. },
  115. {
  116. "cell_type": "code",
  117. "execution_count": null,
  118. "metadata": {
  119. "collapsed": true
  120. },
  121. "outputs": [],
  122. "source": []
  123. }
  124. ],
  125. "metadata": {
  126. "kernelspec": {
  127. "display_name": "1 - PySpark Python3 Small",
  128. "language": "python",
  129. "name": "pyspark3-small"
  130. },
  131. "language_info": {
  132. "codemirror_mode": {
  133. "name": "ipython",
  134. "version": 3
  135. },
  136. "file_extension": ".py",
  137. "mimetype": "text/x-python",
  138. "name": "python",
  139. "nbconvert_exporter": "python",
  140. "pygments_lexer": "ipython3",
  141. "version": "3.5.2"
  142. }
  143. },
  144. "nbformat": 4,
  145. "nbformat_minor": 1
  146. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement