Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- {
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "from pyspark.sql import functions as F\n",
- "from pyspark.sql import types as T"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "df = sqlContext.read.parquet(\"/guoda/data/idigbio-20170708T023306.parquet\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "ic_cc = df.groupby([\"institutioncode\",\"collectioncode\"]).count()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "128926"
- ]
- },
- "execution_count": 10,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "ic_cc.count()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "3573"
- ]
- },
- "execution_count": 12,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "ic_cc[F.col(\"count\") > 1].count()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 32,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "h = ic_cc.select(\"count\").rdd.flatMap(lambda x: x).histogram(list(range(1,10)) + [10**x for x in range(1,7)])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 42,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 100, 1000, 10000, 100000, 1000000],\n",
- " [125353, 453, 154, 109, 94, 53, 48, 45, 31, 656, 547, 588, 572, 213])"
- ]
- },
- "execution_count": 42,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "h"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "1 - PySpark Python3 Small",
- "language": "python",
- "name": "pyspark3-small"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.5.2"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 1
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement