hive操作总结
时间:2014-11-04 18:15 来源:linux.it.net.cn 作者:IT
1 将排序结果插入到新文件中
hive> insert overwrite table re_table1 select * from table1 cluster by id;
2 在shell中使用hive
$HIVE_HOME/bin/hive -S -e "select * from table1 cluster by id" > /home/hadoop/hadoop/hadoop-1.2.1/test/re_s.txt
3 hive中操作hadoop命令
hive> dfs -ls /data;
4 hive> select * from table1 cluster by school limit 10;
不能写成hive> select * from table1 limit 10 cluster by school;
5 hive中打印出字段名
a 可以直接在hive中添加命令:set hive.cli.print.header=true
b 在$HOME/.hiverc文件中添加配置:set hive.cli.print.header=true;(注意:.hiverc中的每一行都必须是完整的hive语句,需以set开头并以分号结尾;若直接写成 hive.cli.print.header=true 这种形式,启动hive时会报错并退出)
6 hive去重
hive> select id, concat_ws(',', collect_set(school)) from table1 group by id;
在这里CONCAT_WS是UDF,COLLECT_SET是UDAF,COLLECT_SET将group by id后每个分组内的school值去重,并收集为array,方便后续UDF(如CONCAT_WS)使用。
7 查看数据库信息
hive> describe database default;
8 显示是哪个数据库
hive> set hive.cli.print.current.db=true;
9 hive执行按严格格式进行的设置
hive> set hive.mapred.mode=strict;
10 创建外部分区表(注意:应先创建表,之后再向location指定的外部路径中放入数据文件;建表前该外部路径下不应已有文件,否则文件布局可能与分区目录结构冲突)
hive> create external table table32(id int, name string, ip string)
> comment 'table32'
> partitioned by (city string, country string)
> row format delimited fields terminated by '\t'
> location '/data/table32';
hive> select * from table321 where city='beijing' and country='China';
将table321中筛选结果插入到table32中,table321无partition结构,table32有partition结构
hive> insert overwrite table table32
> partition (city='beijing', country='China')
> select id,name,ip from table321 t where t.city='beijing' and t.country='China';
11 多个partition选择写入多个分区
hive> from table321 t
> insert overwrite table table32
> partition (city="shanxi", country="China")
> select id, name, ip where t.city="shanxi" and country="China"
> insert overwrite table table32
> partition (city="shanghai", country="China")
> select id, name, ip where t.city="shanghai" and country="China"
> insert overwrite table table32
> partition (city="beijing", country="China")
> select id, name, ip where t.city="beijing" and country="China";
12 在shell中使用-f运行hive多命令
建立test1.sql文件如下:
use test; 注意这是一个hive语句所以要加;
from table321 t
insert overwrite table table32
partition (city="shanxi", country="China")
select id, name, ip where t.city="shanxi" and country="China"
insert overwrite table table32
partition (city="shanghai", country="China")
select id, name, ip where t.city="shanghai" and country="China"
insert overwrite table table32
partition (city="beijing", country="China")
select id, name, ip where t.city="beijing" and country="China"; 注意要加;
建立test1.sh文件如下:
#!/bin/bash
#tablename="table1"
#limitcount="100"
#/usr/hive/hive-0.11.0/bin/hive -S -e "insert table re_table1 select * from ${tablename} cluster by id"
#echo $HIVE_HOME
#$HIVE_HOME/bin/hive -S -e "select * from table1 cluster by id" > /home/hadoop/hadoop/hadoop-1.2.1/test/re_s.txt
echo "hive start!"
$HIVE_HOME/bin/hive -S -f /home/shell/test1.sql
echo "hive ok!"
13 多表连接查询
导入数据到hdfs
[root@centos2 hadoop-1.2.1]# bin/hadoop fs -put /home/hadoop/hadoop/hadoop-1.2.1/test/table351.txt /data/table/table351
[root@centos2 hadoop-1.2.1]# bin/hadoop fs -put /home/hadoop/hadoop/hadoop-1.2.1/test/table352.txt /data/table/table352
[root@centos2 hadoop-1.2.1]# bin/hadoop fs -put /home/hadoop/hadoop/hadoop-1.2.1/test/table353.txt /data/table/table353
hive> select * from table351 a join table352 b on a.id=b.id;输出a和b表的数据,b的数据添加在a的后面
hive> select a.* from table351 a join table352 b on a.id=b.id;只显示a表的数据
hive> select a.* from table351 a join table352 b on a.id=b.id
> join table353 c on c.id=a.id;
14 hadoop/hive
(责任编辑:IT)
1 将排序结果插入到新文件中 hive> insert overwrite table re_table1 select * from table1 cluster by id; 2 在shell中使用hive $HIVE_HOME/bin/hive -S -e "select * from table1 cluster by id" > /home/hadoop/hadoop/hadoop-1.2.1/test/re_s.txt 3 hive中操作hadoop命令 hive> dfs -ls /data; 4 hive> select * from table1 cluster by school limit 10; 不能写成hive> select * from table1 limit 10 cluster by school; 5 hive中打印出字段名 a 可以直接在hive中添加命令:set hive.cli.print.header=true b 在$HOME/.hiverc文件中添加配置:hive.cli.print.header=true(这种形式使用hive会报错并退出) 6 hive去重 hive> select id, concat_ws(',', collect_set(school)) from table1 group by id; 在这里CONCAT_WS是UDF,COLLECT_SET是UDAF,它将group后的id去重,并转换为了array,方便udf使用。 7 查看数据库信息 hive> describe database default; 8 显示是哪个数据库 hive> set hive.cli.print.current.db=true; 9 hive执行按严格格式进行的设置 hive> set hive.mapred.mode=strict 10 创建外部分区表(创建表前,外部路径文件夹下不能有文件,或者不能创建外部文件夹,需要先创建表再创建外部文件夹) hive> create external table table32(id int, name string, ip string) > comment 'table32' > partitioned by (city string, country string) > row format delimited fields terminated by '\t' > location '/data/table32'; hive> select * from table321 where city='beijing' and country='China'; 将table321中筛选结果插入到table32中,table321无partition结构,table32有partition结构 hive> insert overwrite table table32 > partition (city='beijing', country='China') > select id,name,ip from table321 t where t.city='beijing' and t.country='China'; 11 多个partition选择写入多个分区 hive> from table321 t > insert overwrite table table32 > partition (city="shanxi", country="China") > select id, name, ip where t.city="shanxi" and country="China" > insert overwrite table table32 > partition (city="shanghai", country="China") > select id, name, ip where t.city="shanghai" and country="China" > insert overwrite table table32 > partition (city="beijing", country="China") > select id, name, ip where t.city="beijing" and country="China"; 12 在shell中使用-f运行hive多命令 建立test1.sql文件如下: use test; 注意这是一个hive语句所以要加; from table321 t insert overwrite table table32 partition 
(city="shanxi", country="China") select id, name, ip where t.city="shanxi" and country="China" insert overwrite table table32 partition (city="shanghai", country="China") select id, name, ip where t.city="shanghai" and country="China" insert overwrite table table32 partition (city="beijing", country="China") select id, name, ip where t.city="beijing" and country="China"; 注意要加; 建立test1.sh文件如下: #!/bin/bash #tablename="table1" #limitcount="100" #/usr/hive/hive-0.11.0/bin/hive -S -e "insert table re_table1 select * from ${tablename} cluster by id" #echo $HIVE_HOME #$HIVE_HOME/bin/hive -S -e "select * from table1 cluster by id" > /home/hadoop/hadoop/hadoop-1.2.1/test/re_s.txt echo "hive start!" $HIVE_HOME/bin/hive -S -f /home/shell/test1.sql echo "hive ok!" 13 多表连接查询 导入数据到hdfs [root@centos2 hadoop-1.2.1]# bin/hadoop fs -put /home/hadoop/hadoop/hadoop-1.2.1/test/table351.txt /data/table/table351 [root@centos2 hadoop-1.2.1]# bin/hadoop fs -put /home/hadoop/hadoop/hadoop-1.2.1/test/table352.txt /data/table/table352 [root@centos2 hadoop-1.2.1]# bin/hadoop fs -put /home/hadoop/hadoop/hadoop-1.2.1/test/table353.txt /data/table/table353 hive> select * from table351 a join table352 b on a.id=b.id;输出a和b表的数据,b的数据添加在a的后面 hive> select a.* from table351 a join table352 b on a.id=b.id;只显示a表的数据 hive> select a.* from table351 a join table352 b on a.id=b.id > join table353 c on c.id=a.id; 14 hadoop/hive (责任编辑:IT) |