Skip to main content

Order Analysis with Redshift SQL

Setup the environment

!mkdir -p ~/.aws
!pip install -qq psycopg2-binary awscli boto3 awswrangler s3fs
%%writefile ~/.aws/credentials
[default]
aws_access_key_id=
aws_secret_access_key=
%%writefile ~/.aws/config
[default]
region=us-east-1
output=json

Connect to the Redshift cluster

import boto3
import json

%reload_ext sql
def get_secret(secret_name):
region_name = "us-east-1"
session = boto3.session.Session()
client = session.client(
service_name='secretsmanager',
region_name=region_name)
get_secret_value_response = client.get_secret_value(SecretId=secret_name)
get_secret_value_response = json.loads(get_secret_value_response['SecretString'])
return get_secret_value_response

db_credentials = get_secret(secret_name='wysde')

USERNAME = db_credentials["REDSHIFT_USERNAME"]
PASSWORD = db_credentials["REDSHIFT_PASSWORD"]
HOST = db_credentials["REDSHIFT_HOST"]
PORT = db_credentials["REDSHIFT_PORT"]
DBNAME = "dev"
CONN = f"postgresql://{USERNAME}:{PASSWORD}@{HOST}:{PORT}/{DBNAME}"

%sql {CONN}
SCHEMA = "retail"
%sql CREATE SCHEMA IF NOT EXISTS {SCHEMA}
%sql SET search_path = {SCHEMA}

Copy Data from s3 into Redshift Tables

Create Database and Table for Redshift

%%sql
DROP TABLE IF EXISTS orders;
CREATE TABLE orders (
order_id INT PRIMARY KEY,
order_date DATETIME,
order_customer_id INT,
order_status VARCHAR(30)
);
%sql SELECT * FROM sys_load_error_detail LIMIT 10;

Run Copy Command to copy from s3 to Redshift table

REDSHIFT_IAM_ROLE = 'arn:aws:iam::684199068947:role/service-role/AmazonRedshift-CommandsAccessRole-20220921T223853'
%sql COPY orders FROM 's3://wysde2/retail_db/orders/part_0000.csv' iam_role '{REDSHIFT_IAM_ROLE}' DATEFORMAT 'auto' CSV;

Validate using queries against Redshift Table

%sql SELECT * FROM orders LIMIT 10;
order_idorder_dateorder_customer_idorder_status
142013-07-25 00:00:009842PROCESSING
1302013-07-26 00:00:007509PENDING_PAYMENT
2572013-07-26 00:00:003273PROCESSING
5712013-07-28 00:00:005006PENDING
5722013-07-28 00:00:006691PENDING
6442013-07-28 00:00:007295PENDING_PAYMENT
7732013-07-29 00:00:0010414PENDING_PAYMENT
7892013-07-29 00:00:0017COMPLETE
9002013-07-30 00:00:008585CLOSED
9502013-07-30 00:00:004151PENDING_PAYMENT
%sql SELECT count(1) FROM orders;
count
68883

Copy JSON Data from s3 to Redshift table using IAM Role

%%sql
DROP TABLE IF EXISTS order_items;

CREATE TABLE order_items (
order_item_id INT PRIMARY KEY,
order_item_order_id INT,
order_item_product_id INT,
order_item_quantity INT,
order_item_subtotal FLOAT,
order_item_product_price FLOAT
);
%sql COPY order_items FROM 's3://wysde2/retail_db/retail_db_json/order_items' iam_role '{REDSHIFT_IAM_ROLE}' JSON AS 'auto';
%sql SELECT * FROM order_items LIMIT 10;
order_item_idorder_item_order_idorder_item_product_idorder_item_quantityorder_item_subtotalorder_item_product_price
119571299.98299.98
612010144199.9249.98
148611914399.9699.99
4341804031129.99129.99
504207191199.9999.99
5222141912199.9899.99
590239627279.9839.99
752300116144.9944.99
8773533653179.9759.99
946381810599.9519.99
%sql SELECT count(*) FROM order_items;
count
172198

Basic SQL Queries using AWS Redshift SQL

Filtering Data using AWS Redshift

%%sql
SELECT *
FROM orders
WHERE order_status = 'COMPLETE'
LIMIT 10;
order_idorder_dateorder_customer_idorder_status
2482013-07-26 00:00:0011707COMPLETE
5162013-07-28 00:00:009204COMPLETE
6592013-07-28 00:00:006006COMPLETE
7362013-07-29 00:00:008536COMPLETE
7702013-07-29 00:00:0012146COMPLETE
13212013-08-01 00:00:00800COMPLETE
14522013-08-01 00:00:004953COMPLETE
15992013-08-02 00:00:008647COMPLETE
18222013-08-03 00:00:005682COMPLETE
580752013-08-06 00:00:001136COMPLETE
%%sql
SELECT *
FROM orders
WHERE order_status = 'COMPLETE'
AND order_date = '2014-01-01 00:00:00.0'
LIMIT 10;
order_idorder_dateorder_customer_idorder_status
258822014-01-01 00:00:004598COMPLETE
258952014-01-01 00:00:001044COMPLETE
258882014-01-01 00:00:006735COMPLETE
259012014-01-01 00:00:003099COMPLETE
259202014-01-01 00:00:0012232COMPLETE
259212014-01-01 00:00:0012373COMPLETE
259362014-01-01 00:00:003057COMPLETE
259622014-01-01 00:00:005170COMPLETE
259802014-01-01 00:00:00363COMPLETE
259242014-01-01 00:00:003826COMPLETE
%%sql
SELECT *
FROM orders
WHERE order_status = 'COMPLETE'
AND order_date LIKE '2014-01%'
LIMIT 10;
order_idorder_dateorder_customer_idorder_status
260322014-01-02 00:00:002424COMPLETE
265962014-01-05 00:00:006998COMPLETE
621022014-01-09 00:00:006176COMPLETE
278232014-01-13 00:00:003017COMPLETE
287322014-01-18 00:00:005654COMPLETE
289152014-01-19 00:00:00166COMPLETE
291472014-01-21 00:00:006118COMPLETE
296492014-01-24 00:00:008244COMPLETE
298582014-01-26 00:00:005670COMPLETE
301942014-01-28 00:00:009135COMPLETE
%%sql
SELECT *
FROM orders
WHERE order_status IN ('COMPLETE', 'CLOSED')
LIMIT 10;
order_idorder_dateorder_customer_idorder_status
802013-07-25 00:00:003007COMPLETE
7712013-07-29 00:00:009026COMPLETE
7822013-07-29 00:00:0012213COMPLETE
9722013-07-30 00:00:0010255CLOSED
580292013-08-04 00:00:002863COMPLETE
29502013-08-10 00:00:003757COMPLETE
33432013-08-12 00:00:0012289CLOSED
34012013-08-13 00:00:003038COMPLETE
582442013-08-13 00:00:002316CLOSED
34772013-08-14 00:00:006898COMPLETE
%%sql
SELECT *
FROM orders
WHERE order_status IN ('COMPLETE', 'CLOSED')
AND order_date LIKE '2014-01%'
LIMIT 10;
order_idorder_dateorder_customer_idorder_status
259902014-01-02 00:00:004870CLOSED
263572014-01-04 00:00:0011733COMPLETE
264602014-01-05 00:00:007582COMPLETE
269952014-01-09 00:00:007456COMPLETE
270662014-01-09 00:00:002994CLOSED
271882014-01-10 00:00:0012389COMPLETE
274642014-01-11 00:00:007274COMPLETE
279112014-01-13 00:00:008571COMPLETE
279492014-01-14 00:00:0011080CLOSED
687842014-01-14 00:00:0010349COMPLETE

Total Aggregations using AWS Redshift SQL

%%sql
WITH
sales AS (
SELECT 1 AS sale_id, 1 AS course_id, 10.99 AS sale_amount
UNION ALL
SELECT 2, 2, 9.99
UNION ALL
SELECT 3, 1, 10.99
UNION ALL
SELECT 4, 3, 12.99
UNION ALL
SELECT 5, 3, 12.99
UNION ALL
SELECT 6, 1, 9.99
UNION ALL
SELECT 7, 2, 10.99
UNION ALL
SELECT 8, 2, 9.99
UNION ALL
SELECT 9, 4, 11.99
UNION ALL
SELECT 10, 1, 13.99
)
SELECT * FROM sales;
sale_idcourse_idsale_amount
1110.99
229.99
3110.99
4312.99
5312.99
619.99
7210.99
829.99
9411.99
10113.99
%%sql
SELECT count(distinct order_date),
count(distinct order_status)
FROM orders;
countcount_1
3649
%%sql
SELECT *
FROM order_items
LIMIT 10;
order_item_idorder_item_order_idorder_item_product_idorder_item_quantityorder_item_subtotalorder_item_product_price
119571299.98299.98
612010144199.9249.98
148611914399.9699.99
4341804031129.99129.99
504207191199.9999.99
5222141912199.9899.99
590239627279.9839.99
752300116144.9944.99
8773533653179.9759.99
946381810599.9519.99
%%sql
SELECT round(sum(order_item_subtotal), 2)
FROM order_items
WHERE order_item_order_id = 2;
round
579.98
%%sql
SELECT count(*) item_count,
round(sum(order_item_subtotal), 2) total_revenue,
round(avg(order_item_subtotal), 2) avg_revenue
FROM order_items
WHERE order_item_order_id = 2;
item_counttotal_revenueavg_revenue
3579.98193.33