- [cloudera@quickstart Desktop]$ spark-shell
- Setting default log level to "WARN".
- To adjust logging level use sc.setLogLevel(newLevel).
- Welcome to
- ____ __
- / __/__ ___ _____/ /__
- _\ \/ _ \/ _ `/ __/ '_/
- /___/ .__/\_,_/_/ /_/\_\ version 1.6.0
- /_/
- Using Scala version 2.10.5 (Java HotSpot(TM) 64-Bit Server VM, Java 1.7.0_67)
- Type in expressions to have them evaluated.
- Type :help for more information.
- Spark context available as sc (master = yarn-client, app id = application_1576096912396_0008).
- 19/12/11 17:02:41 WARN metastore.ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 1.1.0-cdh5.13.0
- 19/12/11 17:02:41 WARN metastore.ObjectStore: Failed to get database default, returning NoSuchObjectException
- SQL context available as sqlContext.
- scala> sqlContext.sql("CREATE TABLE sample_07 (code string,description string,total_emp int,salary int) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED AS TextFile")
- res0: org.apache.spark.sql.DataFrame = [result: string]
- scala> sqlContext.sql("CREATE TABLE sample_08 (code string,description string,total_emp int,salary int) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED AS TextFile")
- res1: org.apache.spark.sql.DataFrame = [result: string]
- scala> sqlContext.sql("LOAD DATA INPATH '/user/cloudera/mydata/sample_07.csv' OVERWRITE INTO TABLE sample_07")
- chgrp: changing ownership of 'hdfs://quickstart.cloudera:8020/user/hive/warehouse/sample_07/sample_07.csv': User does not belong to supergroup
- res2: org.apache.spark.sql.DataFrame = [result: string]
- scala> sqlContext.sql("LOAD DATA INPATH '/user/cloudera/mydata/sample_08.csv' OVERWRITE INTO TABLE sample_08")
- chgrp: changing ownership of 'hdfs://quickstart.cloudera:8020/user/hive/warehouse/sample_08/sample_08.csv': User does not belong to supergroup
- res3: org.apache.spark.sql.DataFrame = [result: string]
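After the two CREATE TABLE statements and LOAD DATA loads above, it can be worth sanity-checking the tables before querying them. A minimal sketch, assuming the same spark-shell session and Spark 1.6 `sqlContext` as above (these checks and their output are not part of the original session):

```scala
// Sanity checks after loading, run in the same spark-shell session.
sqlContext.sql("SHOW TABLES").show()                      // both tables should be listed
sqlContext.sql("SELECT COUNT(*) FROM sample_07").show()   // non-zero row count after the load
sqlContext.table("sample_07").limit(5).show()             // peek at a few rows
```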
- scala> val df_07 = sqlContext.sql("SELECT * from sample_07")
- df_07: org.apache.spark.sql.DataFrame = [code: string, description: string, total_emp: int, salary: int]
- scala> val df_08 = sqlContext.sql("SELECT * from sample_08")
- df_08: org.apache.spark.sql.DataFrame = [code: string, description: string, total_emp: int, salary: int]
- scala> df_07.filter(df_07("salary") > 150000).show()
- +-------+--------------------+---------+------+
- | code| description|total_emp|salary|
- +-------+--------------------+---------+------+
- |11-1011| Chief executives| 299160|151370|
- |29-1022|Oral and maxillof...| 5040|178440|
- |29-1023| Orthodontists| 5350|185340|
- |29-1024| Prosthodontists| 380|169360|
- |29-1061| Anesthesiologists| 31030|192780|
- |29-1062|Family and genera...| 113250|153640|
- |29-1063| Internists, general| 46260|167270|
- |29-1064|Obstetricians and...| 21340|183600|
- |29-1067| Surgeons| 50260|191410|
- |29-1069|Physicians and su...| 237400|155150|
- +-------+--------------------+---------+------+
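The `filter(df_07("salary") > 150000)` call above is an ordinary row predicate. As a conceptual sketch, the same selection over plain Scala collections (the sample rows and salary values are illustrative, not taken verbatim from the table):

```scala
// Plain-Scala sketch of df_07.filter(df_07("salary") > 150000):
// keep only the rows whose salary exceeds the threshold.
case class Occupation(code: String, description: String, totalEmp: Int, salary: Int)

val rows = Seq(
  Occupation("11-1011", "Chief executives", 299160, 151370),
  Occupation("11-1031", "Legislators", 59920, 37980) // below the threshold, filtered out
)

val highPaid = rows.filter(_.salary > 150000)
```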
- scala> df_07.printSchema()
- root
-  |-- code: string (nullable = true)
-  |-- description: string (nullable = true)
-  |-- total_emp: integer (nullable = true)
-  |-- salary: integer (nullable = true)
- scala> val df_09 = df_07.join(df_08, df_07("code") === df_08("code")).select(df_07.col("code"), df_07.col("description"))
- df_09: org.apache.spark.sql.DataFrame = [code: string, description: string]
- scala> df_09.show()
- +-------+--------------------+
- | code| description|
- +-------+--------------------+
- |00-0000| All Occupations|
- |11-0000|Management occupa...|
- |11-1011| Chief executives|
- |11-1021|General and opera...|
- |11-1031| Legislators|
- |11-2011|Advertising and p...|
- |11-2021| Marketing managers|
- |11-2022| Sales managers|
- |11-2031|Public relations ...|
- |11-3011|Administrative se...|
- |11-3021|Computer and info...|
- |11-3031| Financial managers|
- |11-3041|Compensation and ...|
- |11-3042|Training and deve...|
- |11-3049|Human resources m...|
- |11-3051|Industrial produc...|
- |11-3061| Purchasing managers|
- |11-3071|Transportation, s...|
- |11-9011|Farm, ranch, and ...|
- |11-9012|Farmers and ranchers|
- +-------+--------------------+
- only showing top 20 rows
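The DataFrame join above keeps a `sample_07` row only when `sample_08` also contains that `code`. A plain-Scala sketch of that inner-join semantics, using hypothetical rows rather than the real tables:

```scala
// Conceptual sketch of the inner join on `code`: a sample_07 row survives
// only if some sample_08 row has the same code.
case class Occ(code: String, description: String)

val s07 = Seq(Occ("11-1011", "Chief executives"), Occ("99-9999", "Only in 07"))
val s08 = Seq(Occ("11-1011", "Chief executives"))

val codes08 = s08.map(_.code).toSet                 // join keys present in sample_08
val joined  = s07.filter(r => codes08.contains(r.code))
```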
- scala> val df_10 = sqlContext.sql("select s7.code, s7.description from sample_07 s7 join sample_08 s8 on s7.code == s8.code")
- df_10: org.apache.spark.sql.DataFrame = [code: string, description: string]
- scala> df_10.show()
- +-------+--------------------+
- | code| description|
- +-------+--------------------+
- |00-0000| All Occupations|
- |11-0000|Management occupa...|
- |11-1011| Chief executives|
- |11-1021|General and opera...|
- |11-1031| Legislators|
- |11-2011|Advertising and p...|
- |11-2021| Marketing managers|
- |11-2022| Sales managers|
- |11-2031|Public relations ...|
- |11-3011|Administrative se...|
- |11-3021|Computer and info...|
- |11-3031| Financial managers|
- |11-3041|Compensation and ...|
- |11-3042|Training and deve...|
- |11-3049|Human resources m...|
- |11-3051|Industrial produc...|
- |11-3061| Purchasing managers|
- |11-3071|Transportation, s...|
- |11-9011|Farm, ranch, and ...|
- |11-9012|Farmers and ranchers|
- +-------+--------------------+
- only showing top 20 rows
- scala> df_10.registerTempTable("sparkle_my_ass")
- scala> sqlContext.sql("select * from sparkle_my_ass").show()
- +-------+--------------------+
- | code| description|
- +-------+--------------------+
- |00-0000| All Occupations|
- |11-0000|Management occupa...|
- |11-1011| Chief executives|
- |11-1021|General and opera...|
- |11-1031| Legislators|
- |11-2011|Advertising and p...|
- |11-2021| Marketing managers|
- |11-2022| Sales managers|
- |11-2031|Public relations ...|
- |11-3011|Administrative se...|
- |11-3021|Computer and info...|
- |11-3031| Financial managers|
- |11-3041|Compensation and ...|
- |11-3042|Training and deve...|
- |11-3049|Human resources m...|
- |11-3051|Industrial produc...|
- |11-3061| Purchasing managers|
- |11-3071|Transportation, s...|
- |11-9011|Farm, ranch, and ...|
- |11-9012|Farmers and ranchers|
- +-------+--------------------+
- only showing top 20 rows
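`registerTempTable` only registers the DataFrame under a name in this `SQLContext`; nothing is written to the Hive metastore, and the name disappears when the session ends. A short Spark 1.6 sketch (not part of the original session):

```scala
// The temp table lives only in this SQLContext, not in the metastore.
sqlContext.tableNames().contains("sparkle_my_ass")  // true while registered
sqlContext.dropTempTable("sparkle_my_ass")          // unregister when done
```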
- scala> df_10.persist()
- res10: df_10.type = [code: string, description: string]
- scala> sqlContext.sql("select * from sparkle_my_ass").show()
- +-------+--------------------+
- | code| description|
- +-------+--------------------+
- |00-0000| All Occupations|
- |11-0000|Management occupa...|
- |11-1011| Chief executives|
- |11-1021|General and opera...|
- |11-1031| Legislators|
- |11-2011|Advertising and p...|
- |11-2021| Marketing managers|
- |11-2022| Sales managers|
- |11-2031|Public relations ...|
- |11-3011|Administrative se...|
- |11-3021|Computer and info...|
- |11-3031| Financial managers|
- |11-3041|Compensation and ...|
- |11-3042|Training and deve...|
- |11-3049|Human resources m...|
- |11-3051|Industrial produc...|
- |11-3061| Purchasing managers|
- |11-3071|Transportation, s...|
- |11-9011|Farm, ranch, and ...|
- |11-9012|Farmers and ranchers|
- +-------+--------------------+
- only showing top 20 rows
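The `persist()` call above caches `df_10` at the default storage level, so repeated queries against it (or the temp table backed by it) can reuse the cached data instead of recomputing the join. A hedged sketch of the surrounding Spark 1.6 API, assuming a fresh, not-yet-persisted `df_10` (not part of the original session):

```scala
// Explicit storage level and cleanup around persist() (Spark 1.6 API).
import org.apache.spark.storage.StorageLevel

df_10.persist(StorageLevel.MEMORY_AND_DISK) // pick an explicit storage level
df_10.count()                               // an action materializes the cache
// ... subsequent queries over df_10 reuse the cached data ...
df_10.unpersist()                           // free the cached blocks when done
```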
- scala>