Order Analysis with Redshift SQL
Setup the environment
!mkdir -p ~/.aws
!pip install -qq psycopg2-binary awscli boto3 awswrangler s3fs
%%writefile ~/.aws/credentials
[default]
aws_access_key_id=
aws_secret_access_key=
%%writefile ~/.aws/config
[default]
region=us-east-1
output=json
Connect to the Redshift cluster
import boto3
import json
%reload_ext sql
def get_secret(secret_name):
region_name = "us-east-1"
session = boto3.session.Session()
client = session.client(
service_name='secretsmanager',
region_name=region_name)
get_secret_value_response = client.get_secret_value(SecretId=secret_name)
get_secret_value_response = json.loads(get_secret_value_response['SecretString'])
return get_secret_value_response
db_credentials = get_secret(secret_name='wysde')
USERNAME = db_credentials["REDSHIFT_USERNAME"]
PASSWORD = db_credentials["REDSHIFT_PASSWORD"]
HOST = db_credentials["REDSHIFT_HOST"]
PORT = db_credentials["REDSHIFT_PORT"]
DBNAME = "dev"
CONN = f"postgresql://{USERNAME}:{PASSWORD}@{HOST}:{PORT}/{DBNAME}"
%sql {CONN}
SCHEMA = "retail"
%sql CREATE SCHEMA IF NOT EXISTS {SCHEMA}
%sql SET search_path = {SCHEMA}
Copy Data from s3 into Redshift Tables
Create Database and Table for Redshift
%%sql
DROP TABLE IF EXISTS orders;
CREATE TABLE orders (
order_id INT PRIMARY KEY,
order_date DATETIME,
order_customer_id INT,
order_status VARCHAR(30)
);
Troubleshoot Errors related to Redshift Copy Command
%sql SELECT * FROM sys_load_error_detail LIMIT 10;
Run Copy Command to copy from s3 to Redshift table
REDSHIFT_IAM_ROLE = 'arn:aws:iam::684199068947:role/service-role/AmazonRedshift-CommandsAccessRole-20220921T223853'
%sql COPY orders FROM 's3://wysde2/retail_db/orders/part_0000.csv' iam_role '{REDSHIFT_IAM_ROLE}' DATEFORMAT 'auto' CSV;
Validate using queries against Redshift Table
%sql SELECT * FROM orders LIMIT 10;
order_id | order_date | order_customer_id | order_status |
---|---|---|---|
14 | 2013-07-25 00:00:00 | 9842 | PROCESSING |
130 | 2013-07-26 00:00:00 | 7509 | PENDING_PAYMENT |
257 | 2013-07-26 00:00:00 | 3273 | PROCESSING |
571 | 2013-07-28 00:00:00 | 5006 | PENDING |
572 | 2013-07-28 00:00:00 | 6691 | PENDING |
644 | 2013-07-28 00:00:00 | 7295 | PENDING_PAYMENT |
773 | 2013-07-29 00:00:00 | 10414 | PENDING_PAYMENT |
789 | 2013-07-29 00:00:00 | 17 | COMPLETE |
900 | 2013-07-30 00:00:00 | 8585 | CLOSED |
950 | 2013-07-30 00:00:00 | 4151 | PENDING_PAYMENT |
%sql SELECT count(1) FROM orders;
count |
---|
68883 |
Copy JSON Data from s3 to Redshift table using IAM Role
%%sql
DROP TABLE IF EXISTS order_items;
CREATE TABLE order_items (
order_item_id INT PRIMARY KEY,
order_item_order_id INT,
order_item_product_id INT,
order_item_quantity INT,
order_item_subtotal FLOAT,
order_item_product_price FLOAT
);
%sql COPY order_items FROM 's3://wysde2/retail_db/retail_db_json/order_items' iam_role '{REDSHIFT_IAM_ROLE}' JSON AS 'auto';
%sql SELECT * FROM order_items LIMIT 10;
order_item_id | order_item_order_id | order_item_product_id | order_item_quantity | order_item_subtotal | order_item_product_price |
---|---|---|---|---|---|
1 | 1 | 957 | 1 | 299.98 | 299.98 |
61 | 20 | 1014 | 4 | 199.92 | 49.98 |
148 | 61 | 191 | 4 | 399.96 | 99.99 |
434 | 180 | 403 | 1 | 129.99 | 129.99 |
504 | 207 | 191 | 1 | 99.99 | 99.99 |
522 | 214 | 191 | 2 | 199.98 | 99.99 |
590 | 239 | 627 | 2 | 79.98 | 39.99 |
752 | 300 | 116 | 1 | 44.99 | 44.99 |
877 | 353 | 365 | 3 | 179.97 | 59.99 |
946 | 381 | 810 | 5 | 99.95 | 19.99 |
%sql SELECT count(*) FROM order_items;
count |
---|
172198 |
Basic SQL Queries using AWS Redshift SQL
Filtering Data using AWS Redshift
%%sql
SELECT *
FROM orders
WHERE order_status = 'COMPLETE'
LIMIT 10;
order_id | order_date | order_customer_id | order_status |
---|---|---|---|
248 | 2013-07-26 00:00:00 | 11707 | COMPLETE |
516 | 2013-07-28 00:00:00 | 9204 | COMPLETE |
659 | 2013-07-28 00:00:00 | 6006 | COMPLETE |
736 | 2013-07-29 00:00:00 | 8536 | COMPLETE |
770 | 2013-07-29 00:00:00 | 12146 | COMPLETE |
1321 | 2013-08-01 00:00:00 | 800 | COMPLETE |
1452 | 2013-08-01 00:00:00 | 4953 | COMPLETE |
1599 | 2013-08-02 00:00:00 | 8647 | COMPLETE |
1822 | 2013-08-03 00:00:00 | 5682 | COMPLETE |
58075 | 2013-08-06 00:00:00 | 1136 | COMPLETE |
%%sql
SELECT *
FROM orders
WHERE order_status = 'COMPLETE'
AND order_date = '2014-01-01 00:00:00.0'
LIMIT 10;
order_id | order_date | order_customer_id | order_status |
---|---|---|---|
25882 | 2014-01-01 00:00:00 | 4598 | COMPLETE |
25895 | 2014-01-01 00:00:00 | 1044 | COMPLETE |
25888 | 2014-01-01 00:00:00 | 6735 | COMPLETE |
25901 | 2014-01-01 00:00:00 | 3099 | COMPLETE |
25920 | 2014-01-01 00:00:00 | 12232 | COMPLETE |
25921 | 2014-01-01 00:00:00 | 12373 | COMPLETE |
25936 | 2014-01-01 00:00:00 | 3057 | COMPLETE |
25962 | 2014-01-01 00:00:00 | 5170 | COMPLETE |
25980 | 2014-01-01 00:00:00 | 363 | COMPLETE |
25924 | 2014-01-01 00:00:00 | 3826 | COMPLETE |
%%sql
SELECT *
FROM orders
WHERE order_status = 'COMPLETE'
AND order_date LIKE '2014-01%'
LIMIT 10;
order_id | order_date | order_customer_id | order_status |
---|---|---|---|
26032 | 2014-01-02 00:00:00 | 2424 | COMPLETE |
26596 | 2014-01-05 00:00:00 | 6998 | COMPLETE |
62102 | 2014-01-09 00:00:00 | 6176 | COMPLETE |
27823 | 2014-01-13 00:00:00 | 3017 | COMPLETE |
28732 | 2014-01-18 00:00:00 | 5654 | COMPLETE |
28915 | 2014-01-19 00:00:00 | 166 | COMPLETE |
29147 | 2014-01-21 00:00:00 | 6118 | COMPLETE |
29649 | 2014-01-24 00:00:00 | 8244 | COMPLETE |
29858 | 2014-01-26 00:00:00 | 5670 | COMPLETE |
30194 | 2014-01-28 00:00:00 | 9135 | COMPLETE |
%%sql
SELECT *
FROM orders
WHERE order_status IN ('COMPLETE', 'CLOSED')
LIMIT 10;
order_id | order_date | order_customer_id | order_status |
---|---|---|---|
80 | 2013-07-25 00:00:00 | 3007 | COMPLETE |
771 | 2013-07-29 00:00:00 | 9026 | COMPLETE |
782 | 2013-07-29 00:00:00 | 12213 | COMPLETE |
972 | 2013-07-30 00:00:00 | 10255 | CLOSED |
58029 | 2013-08-04 00:00:00 | 2863 | COMPLETE |
2950 | 2013-08-10 00:00:00 | 3757 | COMPLETE |
3343 | 2013-08-12 00:00:00 | 12289 | CLOSED |
3401 | 2013-08-13 00:00:00 | 3038 | COMPLETE |
58244 | 2013-08-13 00:00:00 | 2316 | CLOSED |
3477 | 2013-08-14 00:00:00 | 6898 | COMPLETE |
%%sql
SELECT *
FROM orders
WHERE order_status IN ('COMPLETE', 'CLOSED')
AND order_date LIKE '2014-01%'
LIMIT 10;
order_id | order_date | order_customer_id | order_status |
---|---|---|---|
25990 | 2014-01-02 00:00:00 | 4870 | CLOSED |
26357 | 2014-01-04 00:00:00 | 11733 | COMPLETE |
26460 | 2014-01-05 00:00:00 | 7582 | COMPLETE |
26995 | 2014-01-09 00:00:00 | 7456 | COMPLETE |
27066 | 2014-01-09 00:00:00 | 2994 | CLOSED |
27188 | 2014-01-10 00:00:00 | 12389 | COMPLETE |
27464 | 2014-01-11 00:00:00 | 7274 | COMPLETE |
27911 | 2014-01-13 00:00:00 | 8571 | COMPLETE |
27949 | 2014-01-14 00:00:00 | 11080 | CLOSED |
68784 | 2014-01-14 00:00:00 | 10349 | COMPLETE |
Total Aggregations using AWS Redshift SQL
%%sql
WITH
sales AS (
SELECT 1 AS sale_id, 1 AS course_id, 10.99 AS sale_amount
UNION ALL
SELECT 2, 2, 9.99
UNION ALL
SELECT 3, 1, 10.99
UNION ALL
SELECT 4, 3, 12.99
UNION ALL
SELECT 5, 3, 12.99
UNION ALL
SELECT 6, 1, 9.99
UNION ALL
SELECT 7, 2, 10.99
UNION ALL
SELECT 8, 2, 9.99
UNION ALL
SELECT 9, 4, 11.99
UNION ALL
SELECT 10, 1, 13.99
)
SELECT * FROM sales;
sale_id | course_id | sale_amount |
---|---|---|
1 | 1 | 10.99 |
2 | 2 | 9.99 |
3 | 1 | 10.99 |
4 | 3 | 12.99 |
5 | 3 | 12.99 |
6 | 1 | 9.99 |
7 | 2 | 10.99 |
8 | 2 | 9.99 |
9 | 4 | 11.99 |
10 | 1 | 13.99 |
%%sql
SELECT count(distinct order_date),
count(distinct order_status)
FROM orders;
count | count_1 |
---|---|
364 | 9 |
%%sql
SELECT *
FROM order_items
LIMIT 10;
order_item_id | order_item_order_id | order_item_product_id | order_item_quantity | order_item_subtotal | order_item_product_price |
---|---|---|---|---|---|
1 | 1 | 957 | 1 | 299.98 | 299.98 |
61 | 20 | 1014 | 4 | 199.92 | 49.98 |
148 | 61 | 191 | 4 | 399.96 | 99.99 |
434 | 180 | 403 | 1 | 129.99 | 129.99 |
504 | 207 | 191 | 1 | 99.99 | 99.99 |
522 | 214 | 191 | 2 | 199.98 | 99.99 |
590 | 239 | 627 | 2 | 79.98 | 39.99 |
752 | 300 | 116 | 1 | 44.99 | 44.99 |
877 | 353 | 365 | 3 | 179.97 | 59.99 |
946 | 381 | 810 | 5 | 99.95 | 19.99 |
%%sql
SELECT round(sum(order_item_subtotal), 2)
FROM order_items
WHERE order_item_order_id = 2;
round |
---|
579.98 |
%%sql
SELECT count(*) item_count,
round(sum(order_item_subtotal), 2) total_revenue,
round(avg(order_item_subtotal), 2) avg_revenue
FROM order_items
WHERE order_item_order_id = 2;
item_count | total_revenue | avg_revenue |
---|---|---|
3 | 579.98 | 193.33 |