Untitled

a guest
Feb 5th, 2016
from pyspark import SparkContext
from pyspark.sql import SQLContext


def doWork(rec):
    # Look up every zip_data row for one (STATEFP, COUNTYFP) pair.
    data = sqlContext.sql(
        "select * from zip_data where STATEFP = '{sfp}' and COUNTYFP = '{cfp}'".format(
            sfp=rec[0], cfp=rec[1]))
    for item in data.collect():
        print(item)
        # do something
    return (rec[0], rec[1])


if __name__ == "__main__":
    sc = SparkContext(appName="Zip Disaggregation")
    print("Starting Zip Disaggregation")

    sqlContext = SQLContext(sc)

    parquetFile = sqlContext.read.parquet("/path/to/data/")
    parquetFile.registerTempTable("zip_data")

    df = sqlContext.sql(
        "select distinct STATEFP, COUNTYFP from zip_data where STATEFP IN ('12')")

    # The SQLContext cannot be used inside an RDD/DataFrame transformation such
    # as df.map(doWork), so collect the distinct pairs and run doWork on the driver.
    rslts = [doWork(rec) for rec in df.collect()]

    for rslt in rslts:
        print(rslt)
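
For comparison, here is a sketch of the same per-county lookup written against the DataFrame API instead of string-formatted SQL, which avoids building the WHERE clause by hand. Column names and the parquet layout are taken from the paste above; the input path and app name are placeholders, and the "do something" step is left open as in the original.

from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import functions as F

sc = SparkContext(appName="Zip Disaggregation (DataFrame API)")
sqlContext = SQLContext(sc)

zip_data = sqlContext.read.parquet("/path/to/data/")

# Distinct (STATEFP, COUNTYFP) pairs for the state of interest,
# collected to the driver.
pairs = (zip_data
         .filter(F.col("STATEFP") == "12")
         .select("STATEFP", "COUNTYFP")
         .distinct()
         .collect())

for rec in pairs:
    # Filter the full dataset down to one county at a time.
    county = zip_data.filter(
        (F.col("STATEFP") == rec.STATEFP) &
        (F.col("COUNTYFP") == rec.COUNTYFP))
    for item in county.collect():
        print(item)
        # do something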