Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- Hive Command:
- 1. Create Table:
- ========================
- CREATE TABLE player_runs(player_id INT, year_of_play STRING, runs_scored INT)
- COMMENT 'This is the player_run table'
- STORED AS TEXTFILE;
- 2. See all tables in a database:
- =========================
- Show tables;
- 3. See Schema of a table
- =========================
- describe player_runs
- 4. Alter table commands :
- =========================
- ALTER TABLE player_runs RENAME TO runs_of_player;
- ALTER TABLE runs_of_player ADD COLUMNS (balls_played INT COMMENT 'a new int column');
- 5. Drop Table
- =========================
- drop table runs_of_player;
- CREATE TABLE player_runs(player_id INT, year_of_play STRING, runs_scored INT, balls_played INT, country STRING)
- COMMENT 'This is the player_run table'
- ROW FORMAT DELIMITED
- FIELDS TERMINATED BY ','
- STORED AS TEXTFILE;
- LOAD DATA LOCAL INPATH '/home/hduser/runs.csv' INTO TABLE player_runs;
- select * from player_runs limit 10;
- 6. Create external table
- =========================
- CREATE EXTERNAL TABLE player_runs_p(player_id INT, year_of_play STRING, runs_scored INT, balls_played INT)
- COMMENT 'This is the staging player_runs table' PARTITIONED BY(country STRING)
- ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n'
- STORED AS TEXTFILE
- LOCATION '/input/runs';
- hadoop fs -mkdir /input/runs/country=India
- hadoop fs -put /home/hduser/runs_India.csv /input/runs/country=India/
- ALTER TABLE player_runs_p ADD PARTITION(country='India');
- 7. Loading data into Table
- =============================
- LOAD DATA LOCAL INPATH '/home/hduser/runs_US.csv' INTO TABLE player_runs_p PARTITION(country='US');
- LOAD DATA LOCAL INPATH '/home/hduser/runs_India.csv' INTO TABLE player_runs_p PARTITION(country='INDIA');
- FROM player_runs_p
- INSERT OVERWRITE TABLE player_runs
- SELECT player_id, year_of_play, runs_scored,balls_played,0
- where player_id=10;
- 8. Simple Select Query:
- ==================
- Select player_id, runs_scored from player_runs;
- select * from player_runs where player_id=10;
- 9. Group BY Query:
- ====================
- select player_id, sum(runs_scored) from player_runs group by player_id;
- 10 Join :
- ==================
- CREATE EXTERNAL TABLE players(player_id INT, name STRING)
- COMMENT 'This is the staging player table'
- ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n'
- STORED AS TEXTFILE
- LOCATION '/input/player';
- LOAD DATA LOCAL INPATH '/home/hduser/player.csv' INTO TABLE players;
- select * from players join player_runs on players.player_id=player_runs.player_id;
- select * from players full outer join player_runs on players.player_id=player_runs.player_id;
- select * from players left outer join player_runs on players.player_id=player_runs.player_id;
- 11 Multi Insert:
- ===========================
- CREATE TABLE player_total_runs(player_id INT, runs_scored INT)
- COMMENT 'This is the player_total_run table'
- STORED AS TEXTFILE;
- CREATE TABLE yearly_runs(year_of_play STRING, runs_scored INT)
- COMMENT 'This is the yearly_run table'
- STORED AS TEXTFILE;
- //Insert into two tables
- FROM player_runs
- INSERT OVERWRITE TABLE player_total_runs
- select player_id, sum(runs_scored)
- group by player_id
- INSERT OVERWRITE TABLE yearly_runs
- select year_of_play, sum(runs_scored)
- group by year_of_play;
- //Insert into one table and one HDFS file
- hadoop fs -mkdir /output/
- hadoop fs -mkdir /output/yearly_runs
- FROM player_runs
- INSERT OVERWRITE TABLE player_total_runs
- select player_id, sum(runs_scored)
- group by player_id
- INSERT OVERWRITE DIRECTORY '/output/yearly_runs'
- select year_of_play, sum(runs_scored)
- group by year_of_play;
- 12 Partioning + Distribute: (HIVE CONFIG ERRORS)
- ============================
- CREATE EXTERNAL TABLE player_runs_distribute(player_id INT, year_of_play STRING, runs_scored INT, balls_played INT)
- COMMENT 'This is the staging player_runs table' PARTITIONED BY(country STRING)
- ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n'
- STORED AS TEXTFILE
- LOCATION '/input/runs_distribute/';
- set hive.exec.dynamic.partition=true;
- set hive.exec.dynamic.partition.mode=nonstrict;
- FROM player_runs
- INSERT OVERWRITE TABLE player_runs_distribute PARTITION(country)
- SELECT player_id, year_of_play , runs_scored , balls_played, country
- DISTRIBUTE BY country;
- GETTING ERROR MESSAGE:
- Dynamic partition strict mode requires at least one static partition column. To turn this off set hive.exec.dynamic.partition.mode=nonstrict
- select * from player_runs_distribute where country='India' limit 10;
- 13 Partioning + Bucketing: (HIVE CONFIG ERRORS)
- ============================
- CREATE EXTERNAL TABLE player_runs_clustered(player_id INT, year_of_play STRING, runs_scored INT, balls_played INT)
- COMMENT 'This is the player_runs table' PARTITIONED BY(country STRING)
- clustered by (player_id) INTO 10 buckets
- ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
- LINES TERMINATED BY '\n'
- STORED AS TEXTFILE
- LOCATION '/input/runs_clustered/';
- LOAD DATA LOCAL INPATH "data/runs_extra.csv" INTO TABLE player_runs;
- set hive.exec.dynamic.partition=true;
- set hive.exec.dynamic.partition.mode=nonstrict;
- set hive.enforce.bucketing=true;
- FROM player_runs
- INSERT OVERWRITE TABLE player_runs_clustered PARTITION(country)
- SELECT player_id, year_of_play , runs_scored , balls_played, country
- DISTRIBUTE BY country;
- select avg(runs_scored) from player_runs_clustered TABLESAMPLE(BUCKET 1 OUT OF 10);
- 14. MapSide Join
- =======================
- select /*+ MAPJOIN(players)*/ * from players join player_runs on players.player_id=player_runs.player_id;
- select * from players join player_runs on players.player_id=player_runs.player_id;
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement