Hive Joins
- Supports only equality joins a.key=b.key
- Doesnât support a.key<b.key type of joins as of now
- Joins with Non-equality conditions are very difficult to express such conditions as a map/reduce job.
Two Tables Online_Retail_Customer
, Online_Retail_Invoice
Push the datasets onto HDFS
hadoop fs -copyFromLocal /home/hduser/datasets/Online_Retail_Sales_Data/Online_Retail_Customer.txt /Online_Retail_Customer
hadoop fs -copyFromLocal /home/hduser/datasets/Online_Retail_Sales_Data/Online_Retail_Invoice.txt /Online_Retail_Invoice
hadoop fs -ls /
Create Table Schema for Online_Retail_Customer on Hive
hive
CREATE TABLE Tbl_Online_Retail_Customer(uniq_idc string, InvoiceDate string, UnitPrice INT, CustomerID INT,Country string) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' ESCAPED BY '\\' ;
Fill the table with data
LOAD DATA INPATH '/Online_Retail_Customer' INTO TABLE Tbl_Online_Retail_Customer;
Check the link http://localhost:50070/explorer.html#/user/hive/warehouse
Create Table Schema for Online_Retail_Invoice on Hive
CREATE TABLE Tbl_Online_Retail_Invoice(uniq_idi string, InvoiceNo string, StockCode string, Description string,Quantity INT) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' ESCAPED BY '\\' ;
Fill the table with data
LOAD DATA INPATH '/Online_Retail_Invoice' INTO TABLE Tbl_Online_Retail_Invoice;
Check the link http://localhost:50070/explorer.html#/user/hive/warehouse
Left join
Drop the table if it is already there.
Drop table Tbl_left_join;
CREATE TABLE Tbl_left_join as SELECT * FROM Tbl_Online_Retail_Customer t1 LEFT JOIN Tbl_Online_Retail_Invoice t2 ON t1.uniq_idc = t2.uniq_idi;
How many rows in the resultant table?
select count(*) from Tbl_left_join;
select count(*) from Tbl_Online_Retail_Customer;
select count(*) from Tbl_Online_Retail_Invoice;
Right join
Drop the table if it is already there.
Drop table Tbl_right_join;
CREATE TABLE Tbl_right_join as SELECT * FROM Tbl_Online_Retail_Customer t1 RIGHT JOIN Tbl_Online_Retail_Invoice t2 ON t1.uniq_idc = t2.uniq_idi;
How many rows in the resultant table?
select count(*) from Tbl_right_join;
select count(*) from Tbl_Online_Retail_Customer;
select count(*) from Tbl_Online_Retail_Invoice;
Inner Join
Drop table Tbl_inner_join;
CREATE TABLE Tbl_inner_join as SELECT * FROM Tbl_Online_Retail_Customer t1 JOIN Tbl_Online_Retail_Invoice t2 ON t1.uniq_idc = t2.uniq_idi;
How many rows in the resultant table?
select count(*) from Tbl_inner_join;
select count(*) from Tbl_Online_Retail_Customer;
select count(*) from Tbl_Online_Retail_Invoice;
Full join
Drop table Tbl_Full_outer_join;
CREATE TABLE Tbl_Full_outer_join as SELECT * FROM Tbl_Online_Retail_Customer t1 FULL JOIN Tbl_Online_Retail_Invoice t2 ON t1.uniq_idc = t2.uniq_idi;
How many rows in the resultant table?
select count(*) from Tbl_Full_outer_join;
select count(*) from Tbl_Online_Retail_Customer;
select count(*) from Tbl_Online_Retail_Invoice;
Further Reading
- Alter Table in hive
- Table Partition
- Hive Views
- Hive Indexes
- Hive Thrift server