Redshift动手实验(一)

1、创建表结构

执行如下命令,在Public Schema下创建表。

create table users(
    userid integer not null distkey sortkey,
    username char(8),
    firstname varchar(30),
    lastname varchar(30),
    city varchar(30),
    state char(2),
    email varchar(100),
    phone char(14),
    likesports boolean,
    liketheatre boolean,
    likeconcerts boolean,
    likejazz boolean,
    likeclassical boolean,
    likeopera boolean,
    likerock boolean,
    likevegas boolean,
    likebroadway boolean,
    likemusicals boolean);

create table venue(
    venueid smallint not null distkey sortkey,
    venuename varchar(100),
    venuecity varchar(30),
    venuestate char(2),
    venueseats integer);

create table category(
    catid smallint not null distkey sortkey,
    catgroup varchar(10),
    catname varchar(10),
    catdesc varchar(50));

create table date(
    dateid smallint not null distkey sortkey,
    caldate date not null,
    day character(3) not null,
    week smallint not null,
    month character(5) not null,
    qtr character(5) not null,
    year smallint not null,
    holiday boolean default('N'));

create table event(
    eventid integer not null distkey,
    venueid smallint not null,
    catid smallint not null,
    dateid smallint not null sortkey,
    eventname varchar(200),
    starttime timestamp);

create table listing(
    listid integer not null distkey,
    sellerid integer not null,
    eventid integer not null,
    dateid smallint not null  sortkey,
    numtickets smallint not null,
    priceperticket decimal(8,2),
    totalprice decimal(8,2),
    listtime timestamp);

create table sales(
    salesid integer not null,
    listid integer not null distkey,
    sellerid integer not null,
    buyerid integer not null,
    eventid integer not null,
    dateid smallint not null sortkey,
    qtysold smallint not null,
    pricepaid decimal(8,2),
    commission decimal(8,2),
    saletime timestamp);

注意执行commit提交生效。

2、从S3加载数据

先创建好Redshift专用的Role,在Role中,挂在S3 Readonly的Policy。然后修改集群,把Role挂到Redshift上。注意S3匹配区域。此外,还需要这个role的ARN,复制下来替换下文的参数。

执行如下命令。

copy users from 's3://redshift-demo-lxy/demo1/allusers_pipe.txt' 
credentials 'aws_iam_role=arn:aws-cn:iam::420029960748:role/lxy-redshift' 
delimiter '|' region 'cn-northwest-1';

copy venue from 's3://redshift-demo-lxy/demo1/venue_pipe.txt' 
credentials 'aws_iam_role=arn:aws-cn:iam::420029960748:role/lxy-redshift' 
delimiter '|' region 'cn-northwest-1';

copy category from 's3://redshift-demo-lxy/demo1/category_pipe.txt' 
credentials 'aws_iam_role=arn:aws-cn:iam::420029960748:role/lxy-redshift' 
delimiter '|' region 'cn-northwest-1';

copy date from 's3://redshift-demo-lxy/demo1/date2008_pipe.txt' 
credentials 'aws_iam_role=arn:aws-cn:iam::420029960748:role/lxy-redshift' 
delimiter '|' region 'cn-northwest-1';

copy event from 's3://redshift-demo-lxy/demo1/allevents_pipe.txt' 
credentials 'aws_iam_role=arn:aws-cn:iam::420029960748:role/lxy-redshift' 
delimiter '|' timeformat 'YYYY-MM-DD HH:MI:SS' region 'cn-northwest-1';

copy listing from 's3://redshift-demo-lxy/demo1/listings_pipe.txt' 
credentials 'aws_iam_role=arn:aws-cn:iam::420029960748:role/lxy-redshift' 
delimiter '|' region 'cn-northwest-1';

copy sales from 's3://redshift-demo-lxy/demo1/sales_tab.txt'
credentials 'aws_iam_role=arn:aws-cn:iam::420029960748:role/lxy-redshift'
delimiter '\t' timeformat 'MM/DD/YYYY HH:MI:SS' region 'cn-northwest-1';

注意执行commit提交生效。

3、确认加载数据成功

执行如下命令校验刚才从S3导入数据的各表的行数。

select count(*) from users;
select count(*) from venue;
select count(*) from category;
select count(*) from date;
select count(*) from event;
select count(*) from listing;
select count(*) from sales;

4、查询测试

运行如下测试,分别哈讯不同的结果。

-- Query 1 Find total sales on a given calendar date.
-- 查找特定日期的交易总数
--
SELECT sum(qtysold) 
FROM   sales, date 
WHERE  sales.dateid = date.dateid 
AND    caldate = '2008-01-05';

-- Find top 10 buyers by quantity.
-- 查找按购买数量的前十的买家
SELECT firstname, lastname, total_quantity 
FROM   (SELECT buyerid, sum(qtysold) total_quantity
        FROM  sales
        GROUP BY buyerid
        ORDER BY total_quantity desc limit 10) Q, users
WHERE Q.buyerid = userid
ORDER BY Q.total_quantity desc;

-- Query 3 Find events in the 99.9 percentile in terms of all time gross sales.
-- 查询所有活动销量销售总额的排名在前1000分之1的活动名称和销售额。
SELECT eventname, total_price 
FROM  (SELECT eventid, total_price, ntile(1000) over(order by total_price desc) as percentile 
       FROM (SELECT eventid, sum(pricepaid) total_price
             FROM   sales
             GROUP BY eventid)) Q, event E
       WHERE Q.eventid = E.eventid
       AND percentile = 1 OR percentile = 2
ORDER BY total_price desc;

5、实验完毕清除资源

执行如下命令清除数据库。

drop table part users;
drop table venue cascade;
drop table category cascade;
drop table date cascade;
drop table event cascade;
drop table listing cascade;
drop table sales cascade;

6、参考资料

参考网址:

https://docs.aws.amazon.com/zh_cn/redshift/latest/gsg/getting-started.html