# 书本的查看记录
create table if not exists test.book(
`user_id` String COMMENT '用户id',
`book_id` String COMMENT '书本id',
`view_time` Int32 COMMENT '页面查看时间,单位秒',
`create_time` DateTime64(3, 'UTC') COMMENT '创建时间'
) ENGINE = MergeTree()
partition by toYYYYMM(create_time)
order by (create_time, book_id)
# 书本浏览记录的预聚合 AggregatingMergeTree, 这里使用的是物化视图,物化视图很多操作跟普通表相同
CREATE MATERIALIZED VIEW IF NOT EXISTS test.book_mv
engine = AggregatingMergeTree()
partition by day
order by (day, book_id)
as select toYYYYMMDD(create_time) as day,
book_id as book_id,
count() as visit,
sumState(view_time) as sum_view_time
from test.book
group by day, book_id
示例数据:
┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┬─Version─┐
│ 4324182021466249494 │ 5 │ 146 │ 1 │ 1 |
│ 4324182021466249494 │ 5 │ 146 │ -1 │ 1 |
│ 4324182021466249494 │ 6 │ 185 │ 1 │ 2 |
└─────────────────────┴───────────┴──────────┴──────┴─────────┘
建表:
CREATE TABLE UAct
(
UserID UInt64,
PageViews UInt8,
Duration UInt8,
Sign Int8,
Version UInt8
)
ENGINE = VersionedCollapsingMergeTree(Sign, Version)
ORDER BY UserID;
插入数据:
INSERT INTO UAct VALUES (4324182021466249494, 5, 146, 1, 1);
INSERT INTO UAct VALUES (4324182021466249494, 5, 146, -1, 1),(4324182021466249494, 6, 185, 1, 2);
上面执行了两个INSERT语句,创建了两个不同的数据片段。如果使用一个INSERT语句,ClickHouse将创建一个数据片段,并且将永远不会执行任何合并。
查询数据:
┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┬─Version─┐
│ 4324182021466249494 │ 5 │ 146 │ 1 │ 1 │
└─────────────────────┴───────────┴──────────┴──────┴─────────┘
┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┬─Version─┐
│ 4324182021466249494 │ 5 │ 146 │ -1 │ 1 │
│ 4324182021466249494 │ 6 │ 185 │ 1 │ 2 │
└─────────────────────┴───────────┴──────────┴──────┴─────────┘
通过两个INSERT语句,创建了两个数据片段。SELECT查询是在两个线程中执行的,我们得到了随机顺序的行。由于尚未合并数据片段,折叠还未发生。我们无法预测ClickHouse在何时执行数据片段的合并。
因此,我们需要使用聚合:
SELECT
UserID,
sum(PageViews * Sign) AS PageViews,
sum(Duration * Sign) AS Duration,
Version
FROM UAct
GROUP BY UserID, Version
HAVING sum(Sign) > 0;
┌──────────────UserID─┬─PageViews─┬─Duration─┬─Version─┐
│ 4324182021466249494 │ 6 │ 185 │ 2 │
└─────────────────────┴───────────┴──────────┴─────────┘
如果不使用聚合,可以对FROM子句使用FINAL修饰符进行强制合并:
:) select * from UAct FINAL;
┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┬─Version─┐
│ 4324182021466249494 │ 6 │ 185 │ 1 │ 2 │
└─────────────────────┴───────────┴──────────┴──────┴─────────┘
1 rows in set. Elapsed: 0.008 sec.
这种数据查询的方法非常低效,不要在大表中使用这种方法。
1. 列级TTL示例
(1)、创建带TTL的表:
DROP TABLE example_table;
CREATE TABLE example_table
(
d DateTime,
a Int TTL d + INTERVAL 1 MINUTE,
b String TTL d + INTERVAL 1 MINUTE,
c String
)
ENGINE = MergeTree
ORDER BY d;
(2)、插入数据:
insert into example_table values (now(), 1, 'value1', 'ccc1');
insert into example_table values(now(), 2, 'value2', 'ccc2');
给表的列添加TTL:
ALTER TABLE example_table
MODIFY COLUMN
c String TTL d + INTERVAL 1 DAY;
修改列的TTL:
ALTER TABLE example_table
MODIFY COLUMN
c String TTL d + INTERVAL 1 MONTH;
2. 表级TTL示例
(1)、创建表
drop table example_table ;
CREATE TABLE example_table
(
d DateTime,
a Int,
b String,
c String
)
ENGINE = MergeTree
ORDER BY d
TTL d + INTERVAL 1 MINUTE DELETE;
(2)、插入数据
insert into example_table values (now(), 1, 'value1', 'ccc1');
insert into example_table values(now(), 2, 'value2', 'ccc2');
等待1分钟后, 执行optimize操作。
optimize table example_table
// 1. 不指定分区
drop table test_partition_non;
create table test_partition_non(name String, timestamp DateTime) ENGINE=MergeTree() order by name;
insert into test_partition_non values ('nanjing', '2020-04-21 12:23:33');
// 2. 数值
drop table test_partition_numeric;
create table test_partition_numeric(id UInt64, timestamp DateTime) ENGINE=MergeTree() order by id partition by id;
insert into test_partition_numeric values (556, '2020-04-21 12:23:33');
// 3. 日期
drop table test_partition_date;
create table test_partition_date(date Date, timestamp DateTime) ENGINE=MergeTree() order by date partition by date;
insert into test_partition_date values ('2020-04-21', '2020-04-21 12:23:33');
// 4. 字符串
drop table test_partition_string;
create table test_partition_string(name String, timestamp DateTime) ENGINE=MergeTree() order by name partition by name;
insert into test_partition_string values ('hangzhou', '2020-04-21 12:23:33');
查看表:
SELECT partition, name, table, active FROM system.parts WHERE table like 'test_partition_%';
示例1:基于表
DROP TABLE IF EXISTS test_aggregates;
CREATE TABLE test_aggregates
(
d Date,
sumV AggregateFunction(sum, UInt64),
uniqV AggregateFunction(uniq, UInt64)
)
ENGINE = AggregatingMergeTree()
ORDER BY d;
INSERT INTO test_aggregates
SELECT
toDate('2020-06-01') AS d,
sumState(number) as sumV,
uniqState(number) AS uniqV
FROM
(
SELECT toUInt64(number%8) as number FROM system.numbers LIMIT 10
);
SELECT sumMerge(sumV), uniqMerge(uniqV) FROM test_aggregates;
示例2:物化视图的使用
drop table t_basic;
create table t_basic(key String, sign UInt8, userId String) ENGINE=MergeTree order by key;
drop table t_m_view;
CREATE MATERIALIZED VIEW t_m_view
ENGINE = AggregatingMergeTree() ORDER BY (key)
AS SELECT
key,
sumState(sign) AS sumSign,
uniqState(userId) AS uniqUsers
FROM t_basic
GROUP BY key;
插入3条记录:
insert into t_basic values('a', 1, '11'),
('a', 2, '22'),
('a', 3, '11');
select * from t_basic;
┌─key─┬─sign─┬─userId─┐
│ a │ 1 │ 11 │
│ a │ 2 │ 22 │
│ a │ 3 │ 11 │
└─────┴──────┴────────┘
select key, sumMerge(sumSign), uniqMerge(uniqUsers) from t_m_view group by key;
┌─key─┬─sumMerge(sumSign)─┬─uniqMerge(uniqUsers)─┐
│ a │ 6 │ 2 │
└─────┴───────────────────┴──────────────────────┘
================================================================================================
示例1: State行和Cancel行匹配示例
================================================================================================
验证: 如果"state"行与"cancel"行的数目匹配,且最后一行是"state"行,则保留第一个"cancel"行和最后一个"state"行。
建表语句:
CREATE TABLE UAct
(
UserID UInt64,
PageViews UInt8,
Duration UInt8,
Sign Int8
)
ENGINE = CollapsingMergeTree(Sign)
ORDER BY UserID
首先清空表:
truncate table UAct;
执行如下的数据插入语句:
INSERT INTO UAct VALUES (4324182021466249494, 1, 11, -1),(4324182021466249494, 2, 12, -1);
INSERT INTO UAct VALUES (4324182021466249494, 3, 13, 1);
INSERT INTO UAct VALUES (4324182021466249494, 4, 14, 1);
查看数据:
xiaochen001 :) select * from UAct;
┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐
│ 4324182021466249494 │ 1 │ 11 │ -1 │
│ 4324182021466249494 │ 2 │ 12 │ -1 │
└─────────────────────┴───────────┴──────────┴──────┘
┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐
│ 4324182021466249494 │ 3 │ 13 │ 1 │
└─────────────────────┴───────────┴──────────┴──────┘
┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐
│ 4324182021466249494 │ 4 │ 14 │ 1 │
└─────────────────────┴───────────┴──────────┴──────┘
4 rows in set. Elapsed: 0.011 sec.
执行计划外的片段合并操作:
optimize table UAct;
查看折叠后的数据:
xiaochen001 :) select * from UAct;
SELECT *
FROM UAct
┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐
│ 4324182021466249494 │ 1 │ 11 │ -1 │
│ 4324182021466249494 │ 4 │ 14 │ 1 │
└─────────────────────┴───────────┴──────────┴──────┘
2 rows in set. Elapsed: 0.005 sec.
从示例中可以观察到,最终的数据片段保留了第一个"cancel"行和最后一个"state"行。
其他场景,有兴趣的可以自行测试。
================================================================================================
示例2: 聚合查询
================================================================================================
示例数据:
┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐
│ 4324182021466249494 │ 5 │ 146 │ 1 │
│ 4324182021466249494 │ 5 │ 146 │ -1 │
│ 4324182021466249494 │ 6 │ 185 │ 1 │
└─────────────────────┴───────────┴──────────┴──────┘
建表:
DROP TABLE UAct;
CREATE TABLE UAct
(
UserID UInt64,
PageViews UInt8,
Duration UInt8,
Sign Int8
)
ENGINE = CollapsingMergeTree(Sign)
ORDER BY UserID;
插入数据:
INSERT INTO UAct VALUES (4324182021466249494, 5, 146, 1);
INSERT INTO UAct VALUES (4324182021466249494, 5, 146, -1),(4324182021466249494, 6, 185, 1);
上面执行了两个INSERT语句,创建了两个不同的数据片段。如果使用一个INSERT语句,ClickHouse将创建一个数据片段,并且将永远不会执行任何合并。
查询数据:
┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐
│ 4324182021466249494 │ 5 │ 146 │ 1 │
└─────────────────────┴───────────┴──────────┴──────┘
┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐
│ 4324182021466249494 │ 5 │ 146 │ -1 │
│ 4324182021466249494 │ 6 │ 185 │ 1 │
└─────────────────────┴───────────┴──────────┴──────┘
通过两个INSERT语句,创建了两个数据片段。SELECT查询是在两个线程中执行的,我们得到了随机顺序的行。由于尚未合并数据片段,折叠还未发生。我们无法预测ClickHouse在何时执行数据片段的合并。
因此,我们需要使用聚合:
SELECT
UserID,
sum(PageViews * Sign) AS PageViews,
sum(Duration * Sign) AS Duration
FROM UAct
GROUP BY UserID
HAVING sum(Sign) > 0
┌──────────────UserID─┬─PageViews─┬─Duration─┐
│ 4324182021466249494 │ 6 │ 185 │
└─────────────────────┴───────────┴──────────┘
如果不使用聚合,可以对FROM子句使用FINAL修饰符进行强制合并:
:) select * from UAct FINAL;
┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐
│ 4324182021466249494 │ 6 │ 185 │ 1 │
└─────────────────────┴───────────┴──────────┴──────┘
1 rows in set. Elapsed: 0.008 sec.
这种数据查询的方法非常低效,不要在大表中使用这种方法。
================================================================================================
示例3: Cancel状态行取反的聚合查询
================================================================================================
示例数据:
┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐
│ 4324182021466249494 │ 5 │ 146 │ 1 │
│ 4324182021466249494 │ -5 │ -146 │ -1 │
│ 4324182021466249494 │ 6 │ 185 │ 1 │
└─────────────────────┴───────────┴──────────┴──────┘
这个方法的核心思想是仅考虑关键字段,在"cancel"行中,可以指定负数,这些负数的值等于行的前一个版本的关键字段值的取反,这样在求和时就可以不使用Sign列。对于这种方法,必须更改PageViews和Duration字段的数据类型,从UIn8改成Int16。
建表语句:
CREATE TABLE UAct
(
UserID UInt64,
PageViews Int16,
Duration Int16,
Sign Int8
)
ENGINE = CollapsingMergeTree(Sign)
ORDER BY UserID
插入数据,并测试:
insert into UAct values(4324182021466249494, 5, 146, 1);
insert into UAct values(4324182021466249494, -5, -146, -1);
insert into UAct values(4324182021466249494, 6, 185, 1);
select * from UAct final;
┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐
│ 4324182021466249494 │ 6 │ 185 │ 1 │
└─────────────────────┴───────────┴──────────┴──────┘
使用聚合时,不需要使用Sign列:
SELECT
UserID,
sum(PageViews) AS PageViews,
sum(Duration) AS Duration
FROM UAct
GROUP BY UserID
┌──────────────UserID─┬─PageViews─┬─Duration─┐
│ 4324182021466249494 │ 6 │ 185 │
└─────────────────────┴───────────┴──────────┘
统计数量:
select count() FROM UAct
┌─count()─┐
│ 3 │
└─────────┘
手动执行计划外的合并:
optimize table UAct final;
select * FROM UAct
┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐
│ 4324182021466249494 │ 6 │ 185 │ 1 │
└─────────────────────┴───────────┴──────────┴──────┘
示例1: 根据排序键去重
1. 准备数据
建表:
DROP TABLE replacingMergeTreeDemo;
CREATE TABLE replacingMergeTreeDemo
(
UserID UInt32,
CounterID UInt32,
UserName String,
EventDate Date
) ENGINE = ReplacingMergeTree()
ORDER BY (UserID, CounterID)
PRIMARY KEY (UserID);
2. 插入数据
分三批次插入数据, 为了观察效果, 下面的三段脚本依次按顺序分别执行, 不要一起执行:
批次1:
insert into replacingMergeTreeDemo values(1,100,'xiaohe1','2020-04-21');
insert into replacingMergeTreeDemo values(2,100,'xiaohe2','2020-04-22');
insert into replacingMergeTreeDemo values(3,102,'xiaohe3','2020-04-23');
批次2:
insert into replacingMergeTreeDemo values(3,103,'xiaohe3','2020-04-23');
insert into replacingMergeTreeDemo values(2,101,'xiaohe2','2020-04-24');
insert into replacingMergeTreeDemo values(1,100,'xiaohe1','2020-04-20');
批次3:
insert into replacingMergeTreeDemo values(2,100,'xiaohe2','2020-04-23');
查看数据:
xiaochen :) select * from replacingMergeTreeDemo;
SELECT *
FROM replacingMergeTreeDemo
┌─UserID─┬─CounterID─┬─UserName─┬──EventDate─┐
│ 1 │ 100 │ xiaohe1 │ 2020-04-21 │
│ 2 │ 100 │ xiaohe2 │ 2020-04-22 │
│ 2 │ 101 │ xiaohe2 │ 2020-04-24 │
│ 3 │ 102 │ xiaohe3 │ 2020-04-23 │
│ 3 │ 103 │ xiaohe3 │ 2020-04-23 │
└────────┴───────────┴──────────┴────────────┘
┌─UserID─┬─CounterID─┬─UserName─┬──EventDate─┐
│ 1 │ 100 │ xiaohe1 │ 2020-04-20 │
└────────┴───────────┴──────────┴────────────┘
┌─UserID─┬─CounterID─┬─UserName─┬──EventDate─┐
│ 2 │ 100 │ xiaohe2 │ 2020-04-23 │
└────────┴───────────┴──────────┴────────────┘
7 rows in set. Elapsed: 0.010 sec.
3. 手工执行计划外合并
执行命令:optimize table replacingMergeTreeDemo
xiaochen :) optimize table replacingMergeTreeDemo;
OPTIMIZE TABLE replacingMergeTreeDemo
Ok.
0 rows in set. Elapsed: 0.005 sec.
4. 查看合并后的数据
xiaochen :) select * from replacingMergeTreeDemo;
SELECT *
FROM replacingMergeTreeDemo
┌─UserID─┬─CounterID─┬─UserName─┬──EventDate─┐
│ 1 │ 100 │ xiaohe1 │ 2020-04-20 │
│ 2 │ 100 │ xiaohe2 │ 2020-04-23 │
│ 2 │ 101 │ xiaohe2 │ 2020-04-24 │
│ 3 │ 102 │ xiaohe3 │ 2020-04-23 │
│ 3 │ 103 │ xiaohe3 │ 2020-04-23 │
└────────┴───────────┴──────────┴────────────┘
5 rows in set. Elapsed: 0.007 sec.
从这个示例可以发现, ReplacingMergeTree是根据排序项对数据去重的,而不是根据主键。
示例2: 使用版本参数的去重
在示例1中, 我们简单观察可以发现, 去重后,保留的记录是最后一条记录。
如果指定了版本,则保留版本列的值最大的记录。
1. 建表
使用EventDate列作为版本字段。
DROP TABLE replacingMergeTreeDemo;
CREATE TABLE replacingMergeTreeDemo
(
UserID UInt32,
CounterID UInt32,
UserName String,
EventDate Date
) ENGINE = ReplacingMergeTree(EventDate)
ORDER BY (UserID, CounterID)
PRIMARY KEY (UserID);
2. 插入数据
分三批次插入数据, 为了观察效果, 下面的三段脚本依次按顺序分别执行, 不要一起执行:
批次1:
insert into replacingMergeTreeDemo values(1,100,'xiaohe1','2020-06-21');
insert into replacingMergeTreeDemo values(2,100,'xiaohe2','2020-05-22');
insert into replacingMergeTreeDemo values(3,102,'xiaohe3','2020-04-23');
批次2:
insert into replacingMergeTreeDemo values(3,103,'xiaohe3','2020-04-23');
insert into replacingMergeTreeDemo values(2,101,'xiaohe2','2020-04-24');
insert into replacingMergeTreeDemo values(1,100,'xiaohe1','2020-04-20');
批次3:
insert into replacingMergeTreeDemo values(2,100,'xiaohe2','2020-04-23');
查看数据:
xiaochen :)
xiaochen :) select * from replacingMergeTreeDemo;
SELECT *
FROM replacingMergeTreeDemo
┌─UserID─┬─CounterID─┬─UserName─┬──EventDate─┐
│ 1 │ 100 │ xiaohe1 │ 2020-06-21 │
│ 2 │ 100 │ xiaohe2 │ 2020-05-22 │
│ 2 │ 101 │ xiaohe2 │ 2020-04-24 │
│ 3 │ 102 │ xiaohe3 │ 2020-04-23 │
│ 3 │ 103 │ xiaohe3 │ 2020-04-23 │
└────────┴───────────┴──────────┴────────────┘
┌─UserID─┬─CounterID─┬─UserName─┬──EventDate─┐
│ 1 │ 100 │ xiaohe1 │ 2020-04-20 │
└────────┴───────────┴──────────┴────────────┘
┌─UserID─┬─CounterID─┬─UserName─┬──EventDate─┐
│ 2 │ 100 │ xiaohe2 │ 2020-04-23 │
└────────┴───────────┴──────────┴────────────┘
7 rows in set. Elapsed: 0.011 sec.
3. 手工执行计划外合并
执行命令:optimize table replacingMergeTreeDemo
xiaochen :) optimize table replacingMergeTreeDemo;
OPTIMIZE TABLE replacingMergeTreeDemo
Ok.
0 rows in set. Elapsed: 0.005 sec.
4. 查看合并后的数据
xiaochen :) select * from replacingMergeTreeDemo;
SELECT *
FROM replacingMergeTreeDemo
┌─UserID─┬─CounterID─┬─UserName─┬──EventDate─┐
│ 1 │ 100 │ xiaohe1 │ 2020-06-21 │
│ 2 │ 100 │ xiaohe2 │ 2020-05-22 │
│ 2 │ 101 │ xiaohe2 │ 2020-04-24 │
│ 3 │ 102 │ xiaohe3 │ 2020-04-23 │
│ 3 │ 103 │ xiaohe3 │ 2020-04-23 │
└────────┴───────────┴──────────┴────────────┘
5 rows in set. Elapsed: 0.007 sec.
从这个示例可以发现, ReplacingMergeTree根据排序项对数据去重的, 保留版本字段列对应的最大值的记录。
drop table summtt;
CREATE TABLE summtt
(
key String,
name String,
value Int32
)
ENGINE = SummingMergeTree()
ORDER BY key;
insert into summtt values('a', 'xiaoa1', 1);
insert into summtt values('a', 'xiaoa2', 2);
insert into summtt values('b', 'xiaob1', 10);
insert into summtt values('b', 'xiaob2', 20);
insert into summtt values('c', 'xiaob1', 10);
insert into summtt values('c', 'xiaob2', -10);
SELECT * FROM summtt;
┌─key─┬─name───┬─value─┐
│ a │ xiaoa1 │ 3 │
│ b │ xiaob1 │ 30 │
│ c │ xiaob1 │ 10 │
└─────┴────────┴───────┘
┌─key─┬─name───┬─value─┐
│ c │ xiaob2 │ -10 │
└─────┴────────┴───────┘
执行计划外的合并:
optimize table summtt;
查看表数据:
select * from summtt;
┌─key─┬─name───┬─value─┐
│ a │ xiaoa1 │ 3 │
│ b │ xiaob1 │ 30 │
└─────┴────────┴───────┘
select key, sum(value) from summtt group by key;
全部评论