
Summary of Hive Operations

1 Insert sorted results into a new table
hive> insert overwrite table re_table1 select * from table1 cluster by id;
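insert overwrite table requires that the target table already exist. A minimal setup sketch, assuming re_table1 should share table1's schema:

hive> create table re_table1 like table1;
hive> insert overwrite table re_table1 select * from table1 cluster by id;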

2 Run Hive from the shell
$HIVE_HOME/bin/hive -S -e "select * from table1 cluster by id" > /home/hadoop/hadoop/hadoop-1.2.1/test/re_s.txt
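The -S flag silences progress messages, so only the query results land in the redirected file. Shell-side values can also be passed in with --hiveconf; a sketch (the variable name maxid is made up for illustration):

$HIVE_HOME/bin/hive -S --hiveconf maxid=100 -e 'select * from table1 where id < ${hiveconf:maxid}'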

3 Run Hadoop filesystem commands inside Hive
hive> dfs -ls /data;
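This is faster than running bin/hadoop dfs from the shell, since Hive reuses its already-running JVM. Local shell commands work the same way with a leading !, for example:

hive> dfs -ls /data/table32;
hive> !pwd;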

4 cluster by must come before limit:
hive> select * from table1 cluster by school limit 10;
It cannot be written as: hive> select * from table1 limit 10 cluster by school;

5 Print column headers in Hive
 a Set it directly at the Hive prompt: set hive.cli.print.header=true;
 b Add the setting to the $HOME/.hiverc file as hive.cli.print.header=true (written this way, Hive reports an error and exits on startup)
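.hiverc is executed as a sequence of Hive statements, so each line must be a complete statement with a trailing semicolon. Writing the entry as a full set command should avoid the startup error noted in b:

set hive.cli.print.header=true;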

6 Deduplicate in Hive
hive> select id, concat_ws(',', collect_set(school)) from table1 group by id;
Here CONCAT_WS is a UDF and COLLECT_SET is a UDAF: COLLECT_SET deduplicates the school values within each id group and collects them into an array, which CONCAT_WS then joins into a comma-separated string.
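A small worked example on hypothetical rows: if table1 held (1, pku), (1, pku), (1, thu), (2, fudan), the query would return one row per id (the order inside each collected set is not guaranteed):

1    pku,thu
2    fudan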

7 View database information
hive> describe database default;
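The extended form also prints any database properties:

hive> describe database extended default;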

8 Show which database is current
hive> set hive.cli.print.current.db=true;
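With this set, the prompt includes the active database, e.g.:

hive (default)> use test;
hive (test)>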

9 Enable strict execution mode
hive> set hive.mapred.mode=strict;
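Strict mode rejects a few risky query shapes: scanning a partitioned table without a partition filter, order by without limit, and unconstrained cross joins. Against the partitioned table32 from item 10 below, for example:

hive> select * from table32;                          (rejected in strict mode)
hive> select * from table32 where city='beijing';     (allowed: filters on a partition column)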

10 Create an external partitioned table (the directory at the external location must not already contain files when the table is created; alternatively, do not pre-create the external directory at all, i.e. create the table first and the directory afterwards)

hive> create external table table32(id int, name string, ip string)                             
    > comment 'table32'                                            
    > partitioned by (city string, country string)                 
    > row format delimited fields terminated by '\t'               
    > location '/data/table32';   
hive> select * from table321 where city='beijing' and country='China';

Insert the filtered rows from table321 into table32 (table321 is an ordinary non-partitioned table, table32 is partitioned):
hive> insert overwrite table table32                                           
    > partition (city='beijing', country='China')                              
    > select id,name,ip from table321 t where t.city='beijing' and t.country='China';
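To confirm that the insert created the partition, list the table's partitions:

hive> show partitions table32;
city=beijing/country=China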

11 Write to multiple partitions with a single multi-insert
hive> from table321 t                            
    > insert overwrite table table32                                
    > partition (city="shanxi", country="China")
    > select id, name, ip where t.city="shanxi" and country="China"
    > insert overwrite table table32                               
    > partition (city="shanghai", country="China")                   
    > select id, name, ip where t.city="shanghai" and country="China"
    > insert overwrite table table32                                 
    > partition (city="beijing", country="China")                    
    > select id, name, ip where t.city="beijing" and country="China";

12 Run multiple Hive commands from the shell with -f
Create test1.sql as follows:
use test;  -- this is a Hive statement, so the trailing semicolon is required
from table321 t
insert overwrite table table32
partition (city="shanxi", country="China")
select id, name, ip where t.city="shanxi" and country="China"
insert overwrite table table32
partition (city="shanghai", country="China")
select id, name, ip where t.city="shanghai" and country="China"
insert overwrite table table32
partition (city="beijing", country="China")
select id, name, ip where t.city="beijing" and country="China"; 注意要加;

Create test1.sh as follows:
#!/bin/bash
#tablename="table1"
#limitcount="100"
#/usr/hive/hive-0.11.0/bin/hive -S -e "insert table re_table1 select * from ${tablename} cluster by id"
#echo $HIVE_HOME
#$HIVE_HOME/bin/hive -S -e "select * from table1 cluster by id" > /home/hadoop/hadoop/hadoop-1.2.1/test/re_s.txt
echo "hive start!"
$HIVE_HOME/bin/hive -S -f /home/shell/test1.sql
echo "hive ok!"

13 Multi-table join query
Load the data files into HDFS:
[root@centos2 hadoop-1.2.1]# bin/hadoop fs -put /home/hadoop/hadoop/hadoop-1.2.1/test/table351.txt /data/table/table351
[root@centos2 hadoop-1.2.1]# bin/hadoop fs -put /home/hadoop/hadoop/hadoop-1.2.1/test/table352.txt /data/table/table352
[root@centos2 hadoop-1.2.1]# bin/hadoop fs -put /home/hadoop/hadoop/hadoop-1.2.1/test/table353.txt /data/table/table353

hive> select * from table351 a join table352 b on a.id=b.id;   (returns columns from both a and b, with b's columns appended after a's)
hive> select a.* from table351 a join table352 b on a.id=b.id;   (returns only a's columns)
hive> select a.* from table351 a join table352 b on a.id=b.id
    > join table353 c on c.id=a.id;   
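Outer joins use the same syntax; a sketch of a left outer join that keeps every row of table351, filling b's columns with NULL where no id matches:

hive> select * from table351 a left outer join table352 b on a.id=b.id;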
