以Postgresql为主,再聊聊资料库应用递迴加快 count distinct 的等效查询-58码农网

create table it201011a (  gal text not null);-- 输入一亿笔insert into it201011aselect (array['小岛南', '初川南', '相沢南', '架乃ゆら', '山岸逢花', '枫カレン', '坂道みる', '桥本ありな', '葵つかさ', '天使もえ', '水卜さくら'])[floor(random() * 11) + 1::int]  from generate_series(1, 1e8);create index on it201011a(gal);-- 要找出栏位distinct , 一般直接的方式.select distinct gal  from it201011a;这样会很慢....我们只看一下 评估成本,就不用 timing.explain (costs)select distinct gal  from it201011a;+------------------------------------------------------------------------------+|                                  QUERY PLAN                                  |+------------------------------------------------------------------------------+| HashAggregate  (cost=1790541.00..1790541.11 rows=11 width=12)                ||   Group Key: gal                                                             ||   ->  Seq Scan on it201011a  (cost=0.00..1540541.00 rows=100000000 width=12) |+------------------------------------------------------------------------------+--with recursive t1 as (select min(t.gal) gal  from it201011a t where t.gal is not nullunion allselect (select min(gal)           from it201011a a        where a.gal > b.gal          and a.gal is not null)  from t1 b where b.gal is not null)select *  from t1 where gal is not null;+------------+|    gal     |+------------+| 初川南     || 小岛南     || 相沢南     || 坂道みる   || 天使もえ   || 山岸逢花   || 架乃ゆら   || 枫カレン   || 葵つかさ   || 桥本ありな || 水卜さくら |+------------+(11 rows)Time: 4.377 ms速度超快!! 来看一下执行计画explain (analyze,timing,costs)with recursive t1 as (select min(t.gal) gal  from it201011a t where t.gal is not nullunion allselect (select min(gal)           from it201011a a        where a.gal > b.gal          and a.gal is not null)  from t1 b where b.gal is not null)select *  from t1 where gal is not null; +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+|                                                                                       QUERY PLAN                                                                                       |+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+| CTE Scan on t1  (cost=241.33..243.35 rows=100 width=32) (actual time=0.147..3.688 rows=11 loops=1)                                                                                     ||   Filter: (gal IS NOT NULL)                                                                                                                                                            ||   Rows Removed by Filter: 1                                                                                                                                                            ||   CTE t1                                                                                                                                                                               ||     ->  Recursive Union  (cost=2.33..241.33 rows=101 width=32) (actual time=0.141..3.666 rows=12 loops=1)                                                                              ||           ->  Result  (cost=2.33..2.34 rows=1 width=32) (actual time=0.107..0.109 rows=1 loops=1)                                                                                      ||                 InitPlan 3 (returns $1)                                                                                                                                                ||                   ->  Limit  (cost=0.57..2.33 rows=1 width=12) (actual time=0.100..0.101 rows=1 loops=1)                                                                               ||                         ->  Index Only Scan using it201011a_gal_idx on it201011a t  (cost=0.57..176009683.02 rows=100000000 width=12) (actual time=0.098..0.099 rows=1 loops=1)        ||                               Index Cond: (gal IS NOT NULL)                                                                                                                            ||                               Heap Fetches: 1                                                                                                                                          ||           ->  WorkTable Scan on t1 b  (cost=0.00..23.70 rows=10 width=32) (actual time=0.291..0.292 rows=1 loops=12)                                                                   ||                 Filter: (gal IS NOT NULL)                                                                                                                                              ||                 Rows Removed by Filter: 0                                                                                                                                              ||                 SubPlan 2                                                                                                                                                              ||                   ->  Result  (cost=2.34..2.35 rows=1 width=32) (actual time=0.315..0.315 rows=1 loops=11)                                                                             ||                         InitPlan 1 (returns $3)                                                                                                                                        ||                           ->  Limit  (cost=0.57..2.34 rows=1 width=12) (actual time=0.313..0.313 rows=1 loops=11)                                                                      ||                                 ->  Index Only Scan using it201011a_gal_idx on it201011a a  (cost=0.57..59072943.18 rows=33333333 width=12) (actual time=0.312..0.312 rows=1 loops=11) ||                                       Index Cond: ((gal > b.gal) AND (gal IS NOT NULL))                                                                                                ||                                       Heap Fetches: 10                                                                                                                                 || Planning Time: 0.437 ms                                                                                                                                                                || Execution Time: 3.823 ms                                                                                                                                                               |+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+其实就一个loop,依次比较,且逐步减少需要聚合的量.-----------接着来看第二个例子. 有一种情境是 一个 table 存放 device 相关资料,device 可以是 sensor, 车辆 等等;另一个 table 存放 device 的纪录, 例如感应数据,或是行车状况等等.为了简便起见,以下的例子就不加上日期.create table itdevice ( id int not null primary key);insert into itdeviceselect generate_series(1,1000);create table itdevval (  id int generated always as identity, devid int not null, ts timestamp not null);输入600万笔资料insert into itdevval(devid, ts)select b.devid     , clock_timestamp()  from (select generate_series(1,1e4)) a     , (select generate_series(1,600) devid) b;建立 itdevval 中 devid 栏位的 index.create index on itdevval(devid);----当我们想要查询哪些device,没有数据资料,例如设备故障等情况.因为两边都有 device id 的 index, 一般直接查询.基本的 not inselect *  from itdevice where id not in (       select distinct devid         from itdevval);Time: 1372.260 ms (00:01.372)select *  from itdevice where id not in (       select devid         from itdevval);Time: 249832.184 ms (04:09.832)来看一下执行计画explain (costs)select *  from itdevice where id not in (       select distinct devid         from itdevval);+--------------------------------------------------------------------------------+|                                   QUERY PLAN                                   |+--------------------------------------------------------------------------------+| Seq Scan on itdevice  (cost=107440.50..107458.00 rows=500 width=4)             ||   Filter: (NOT (hashed SubPlan 1))                                             ||   SubPlan 1                                                                    ||     ->  HashAggregate  (cost=107433.00..107439.00 rows=600 width=4)            ||           Group Key: itdevval.devid                                            ||           ->  Seq Scan on itdevval  (cost=0.00..92433.00 rows=6000000 width=4) |+--------------------------------------------------------------------------------+explain (costs)select *  from itdevice where id not in (       select devid         from itdevval);+--------------------------------------------------------------------------------+|                                   QUERY PLAN                                   |+--------------------------------------------------------------------------------+| Seq Scan on itdevice  (cost=0.00..80435517.50 rows=500 width=4)                ||   Filter: (NOT (SubPlan 1))                                                    ||   SubPlan 1                                                                    ||     ->  Materialize  (cost=0.00..145871.00 rows=6000000 width=4)               ||           ->  Seq Scan on itdevval  (cost=0.00..92433.00 rows=6000000 width=4) |+--------------------------------------------------------------------------------+distinct 有聚合, HashAggregate  Group Key: itdevval.devid 但是都是 Seq Scan使用 outer join 的方式select a.id  from itdevice a  left join itdevval b    on (a.id = b.devid) where b.* is null;Time: 2218.486 ms (00:02.218)explain (costs)select a.id  from itdevice a  left join itdevval b    on (a.id = b.devid) where b.* is null;+---------------------------------------------------------------------------+|                                QUERY PLAN                                 |+---------------------------------------------------------------------------+| Hash Right Join  (cost=27.50..108277.25 rows=30000 width=4)               ||   Hash Cond: (b.devid = a.id)                                             ||   Filter: (b.* IS NULL)                                                   ||   ->  Seq Scan on itdevval b  (cost=0.00..92433.00 rows=6000000 width=44) ||   ->  Hash  (cost=15.00..15.00 rows=1000 width=4)                         ||         ->  Seq Scan on itdevice a  (cost=0.00..15.00 rows=1000 width=4)  |+---------------------------------------------------------------------------+递迴的方式with recursive t1 as (select min(devid) devid  from itdevval where devid is not nullunion allselect (select min(v.devid)          from itdevval v         where v.devid > t1.devid           and v.devid is not null)  from t1 where t1.devid is not null), t2 as (select devid  from t1 where devid is not null -- t2 是有 itdevval 有资料的 devid)select id  from itdevice where id not in (       select devid         from t2);         Time: 11.715 ms原理与第一个例子相同.速度比一般方法快多了.
给这篇文章的作者打赏

关于作者: 网站小编

相关文章

HBO Max vs.Netflix：当你负担不起两者时如何选择

课内笔记整理---作业系统实务(资安相关篇)

excel vba捞网页数据问题

热门文章

1以Postgresql为主,再聊聊资料库 应用递迴加快 count distinct 的等效查询

2Week33 - 利用Golang Server与Nats实作简单的Pub/Sub demo [Server的终局之战系

3鼠年全马铁人挑战 WEEK 34：负载性能测试 - Gatling (上)

4以Postgresql为主,再聊聊资料库 递迴 又见递迴

510大安全软体爆漏洞

1以Postgresql为主,再聊聊资料库应用递迴加快 count distinct 的等效查询

4以Postgresql为主,再聊聊资料库递迴又见递迴