moh_hassan

Read a text file using Spark and Python

Nov 20th, 2016
# http://spark.apache.org/docs/latest/sql-programming-guide.html

import sys

try:
    from pyspark import SparkContext
    from pyspark import SparkConf
    print("Successfully imported Spark modules")
    # Run Spark locally with the default configuration.
    sc = SparkContext('local')
except ImportError as e:
    print("Cannot import Spark modules:", e)
    sys.exit(1)
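# SparkConf is imported above but never used. A minimal sketch of how it
# could be passed to the context (the app name and master string below are
# illustrative, not taken from the original paste); only one SparkContext
# can exist at a time, so this would replace the line above rather than
# adding to it:
#
#     conf = SparkConf().setAppName("read-log-file").setMaster("local[*]")
#     sc = SparkContext(conf=conf)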
# fname = "I:/temp/data/f1.txt"
logFile = "C:/temp/2013-09-15.log"

from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

# Read the space-delimited log file into a DataFrame; there is no header
# row, so columns are auto-named _c0, _c1, ...
# df = sqlContext.read.format('com.databricks.spark.csv').options(header='false').load(logFile)
df = sqlContext.read.format('com.databricks.spark.csv').options(header='false', delimiter=' ').load(logFile)
df.show()
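# Spark 2.0+ also ships a built-in CSV data source, so the same load can be
# written with the short format name; a sketch of the equivalent call (not
# part of the original paste):
#
#     df = sqlContext.read.format('csv') \
#         .options(header='false', delimiter=' ') \
#         .load(logFile)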
# Inspect the inferred schema; with no header row every column is an
# auto-named string column (_c0 ... _c10).
df.printSchema()
'''
root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)
 |-- _c7: string (nullable = true)
 |-- _c8: string (nullable = true)
 |-- _c9: string (nullable = true)
 |-- _c10: string (nullable = true)
'''
# df.select("GoogleKnowlege_Occupation").show()   # column from another dataset, kept for reference

# Project two of the log columns.
df.select(df['_c0'], df['_c2']).show()

# The commented examples below come from the Spark SQL programming guide and
# assume an 'age' column that this log file does not have; equivalents for
# this file's columns follow right after them.
# Select people older than 21
# df.filter(df['age'] > 21).show()

# Count people by age
# df.groupBy("age").count().show()
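# The same ideas expressed against the columns this file actually has.
# Grouping by the first field works for any data:
df.groupBy('_c0').count().show()
# A filter needs a concrete value from the data; the 'GET' below is only a
# guess at a typical log field, not taken from the original file:
# df.filter(df['_c2'] == 'GET').show()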
# Running SQL queries programmatically:
# the sql function on a SparkSession runs a SQL query and returns the result
# as a DataFrame.

# Register the DataFrame as a SQL temporary view.
df.createOrReplaceTempView("people")

# Importing pyspark.shell builds a SparkSession (exposed as 'spark') on top
# of the existing SparkContext and prints the banner quoted at the bottom of
# this file; SparkSession.builder.getOrCreate() is the usual way to obtain
# one in a standalone script.
from pyspark.shell import spark

sqlDF = spark.sql("SELECT * FROM people")
sqlDF.show()
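# Any SQL statement over the registered view returns a DataFrame; for
# example, an aggregate equivalent to the groupBy above ('_c0' is simply the
# auto-generated name of the first column):
spark.sql("SELECT _c0, COUNT(*) AS n FROM people GROUP BY _c0").show()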

sys.exit(0)
'''
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.0.2
      /_/

Using Python version 3.5.1 (v3.5.1:37a07cee5969, Dec  6 2015 01:38:48)
SparkSession available as 'spark'.
'''
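# For reference, the whole read can be done in Spark 2.x style with a
# SparkSession and the built-in CSV reader; a minimal standalone sketch
# (the app name is illustrative, the path is the one used above):
#
#     from pyspark.sql import SparkSession
#
#     spark = SparkSession.builder.appName("read-log-file").getOrCreate()
#     df = spark.read.option("header", "false") \
#                    .option("delimiter", " ") \
#                    .csv("C:/temp/2013-09-15.log")
#     df.show()
#     df.printSchema()
#     spark.stop()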