Movie Sentiment Analysis
Setup the environment
!mkdir -p ~/.aws
%%writefile ~/.aws/credentials
[default]
aws_access_key_id=
aws_secret_access_key=
region=us-east-1
output=json
!pip install -q -U requests
!pip install -q -U boto3
!pip install -q -U ipython-sql
!pip install -q -U psycopg2-binary
!pip install -q -U matplotlib
!pip install -q -U reportlab
import boto3
import zipfile
import json
import os
import io
import sys
import time
import psycopg2
import requests
import numpy as np
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
from urllib.request import urlopen
from urllib.parse import urlparse
from tqdm.auto import tqdm
%reload_ext sql
Connect to the Redshift cluster
def get_secret(secret_name, region_name="us-east-1"):
    """Fetch a secret from AWS Secrets Manager and decode it as JSON.

    Args:
        secret_name: Identifier passed to Secrets Manager as ``SecretId``.
        region_name: AWS region of the Secrets Manager endpoint. Defaults to
            ``"us-east-1"``, the value this notebook previously hard-coded.

    Returns:
        The object obtained by JSON-decoding the response's ``SecretString``.

    Raises:
        KeyError: If the response has no ``SecretString`` entry.
        json.JSONDecodeError: If ``SecretString`` is not valid JSON.
    """
    session = boto3.session.Session()
    client = session.client(
        service_name='secretsmanager',
        region_name=region_name)
    response = client.get_secret_value(SecretId=secret_name)
    return json.loads(response['SecretString'])
# Read the Redshift connection settings from the 'wysde' secret.
db_credentials = get_secret(secret_name='wysde')
USERNAME = db_credentials["REDSHIFT_USERNAME"]
PASSWORD = db_credentials["REDSHIFT_PASSWORD"]
HOST = db_credentials["REDSHIFT_HOST"]
PORT = db_credentials["REDSHIFT_PORT"]
DBNAME = "dev"
# Connection URL used by the %sql magic below and, later on, by psycopg2.connect(CONN).
# NOTE(review): the credentials are interpolated verbatim; presumably they need
# URL-encoding if they ever contain reserved characters such as '@' or '/' - confirm.
CONN = f"postgresql://{USERNAME}:{PASSWORD}@{HOST}:{PORT}/{DBNAME}"
%sql {CONN}
# IAM role referenced as :iam_role by the COPY statements in the %%sql cells.
iam_role = 'arn:aws:iam::684199068947:role/service-role/AmazonRedshift-CommandsAccessRole-20220921T223853'
# S3 bucket that receives the staged files and is read back by COPY.
staging_bucket = 'wysde2'
URLs and Paths
# One timestamp per run so both IMDb staging objects carry the same suffix.
_staging_timestamp = int(time.time())

# TMDb movies: source archive and staging key prefix
tmdb_movies_source_url = 'https://hudsonmendes-datalake.s3.eu-west-2.amazonaws.com/kaggle/hudsonmendes/tmdb-movies-with-imdb_id.zip'
tmdb_movies_staging_folder = 'tmdb-movies-with-imdb_id/'

# TMDb reviews: source archive and staging key prefix
tmdb_reviews_source_url = 'https://hudsonmendes-datalake.s3.eu-west-2.amazonaws.com/kaggle/hudsonmendes/tmdb-reviews.zip'
tmdb_reviews_staging_folder = 'tmdb-reviews'

# IMDb cast (links between titles and people)
imdb_cast_source_url = 'https://datasets.imdbws.com/title.principals.tsv.gz'
imdb_cast_staging_path = f'imdb-cast/imdb-cast-{_staging_timestamp}.tsv.gz'

# IMDb cast (names of people)
imdb_names_source_url = 'https://datasets.imdbws.com/name.basics.tsv.gz'
imdb_names_staging_path = f'imdb-cast/imdb-cast-names{_staging_timestamp}.tsv.gz'
Pipeline
This Pipeline is composed of the following steps:
- Unzip files from the Data Lake into our Staging S3
- COPY data from Staging S3 to our Redshift Staging Tables
- Extract, Transform and Load the data into our Data Warehouse Dimensions Table
- Classify Sentiment of Movie Reviews using our Model, generating our Data Warehouse Facts Table
Unzip files from the Data Lake into our Staging S3
# Upload files within ZIP to S3 Staging
def stage_zip_files_to_s3(
        source_url,
        destination_bucket,
        destination_folder):
    """Download a ZIP archive and upload each of its members to S3.

    The archive is read fully into memory, then every entry listed by the
    archive is stored as ``<destination_folder>/<entry name>``.

    Args:
        source_url: URL of the ZIP archive to download.
        destination_bucket: Name of the S3 bucket that receives the members.
        destination_folder: Key prefix under which the members are stored.
    """
    # S3 keys always use '/', so join with posixpath rather than os.path.
    import posixpath

    s3 = boto3.client('s3')
    print(f'Downloading "{source_url}", please wait...')
    with urlopen(source_url) as res:
        buffer = io.BytesIO(res.read())
    with zipfile.ZipFile(buffer) as file_zip:
        print('Download completed.')
        print(f'Uploading each file from "{source_url}" to s3://{destination_bucket}/{destination_folder}')
        for inner_file_name in tqdm(file_zip.namelist(), 'extracting to s3'):
            inner_file_buffer = file_zip.read(inner_file_name)
            s3.put_object(
                # Honour the caller's bucket instead of the notebook-level global.
                Bucket=destination_bucket,
                Key=posixpath.join(destination_folder, inner_file_name),
                Body=inner_file_buffer)
            print(f'[ok] {inner_file_name}')
stage_zip_files_to_s3(
source_url=tmdb_movies_source_url,
destination_bucket=staging_bucket,
destination_folder=tmdb_movies_staging_folder)
# Downloading "https://hudsonmendes-datalake.s3.eu-west-2.amazonaws.com/kaggle/hudsonmendes/tmdb-movies-with-imdb_id.zip", please wait...
# Download completed.
# Uploading each file in "https://hudsonmendes-datalake.s3.eu-west-2.amazonaws.com/kaggle/hudsonmendes/tmdb-movies-with-imdb_id.zip" to s3://wysde2/tmdb-movies-with-imdb_id/
# extracting to s3: 0%| | 0/21 [00:00<?, ?it/s]
# [ok] tmdb-movies-2000.json
# [ok] tmdb-movies-2001.json
# [ok] tmdb-movies-2002.json
# [ok] tmdb-movies-2003.json
# [ok] tmdb-movies-2004.json
# [ok] tmdb-movies-2005.json
# [ok] tmdb-movies-2006.json
# [ok] tmdb-movies-2007.json
# [ok] tmdb-movies-2008.json
# [ok] tmdb-movies-2009.json
# [ok] tmdb-movies-2010.json
# [ok] tmdb-movies-2011.json
# [ok] tmdb-movies-2012.json
# [ok] tmdb-movies-2013.json
# [ok] tmdb-movies-2014.json
# [ok] tmdb-movies-2015.json
# [ok] tmdb-movies-2016.json
# [ok] tmdb-movies-2017.json
# [ok] tmdb-movies-2018.json
# [ok] tmdb-movies-2019.json
# [ok] tmdb-movies-2020.json
stage_zip_files_to_s3(
source_url=tmdb_reviews_source_url,
destination_bucket=staging_bucket,
destination_folder=tmdb_reviews_staging_folder)
# Downloading "https://hudsonmendes-datalake.s3.eu-west-2.amazonaws.com/kaggle/hudsonmendes/tmdb-reviews.zip", please wait...
# Download completed.
# Uploading each file in "https://hudsonmendes-datalake.s3.eu-west-2.amazonaws.com/kaggle/hudsonmendes/tmdb-reviews.zip" to s3://wysde2/tmdb-reviews
# extracting to s3: 0%| | 0/21 [00:00<?, ?it/s]
# [ok] tmdb-movies-2000-reviews.json
# [ok] tmdb-movies-2001-reviews.json
# [ok] tmdb-movies-2002-reviews.json
# [ok] tmdb-movies-2003-reviews.json
# [ok] tmdb-movies-2004-reviews.json
# [ok] tmdb-movies-2005-reviews.json
# [ok] tmdb-movies-2006-reviews.json
# [ok] tmdb-movies-2007-reviews.json
# [ok] tmdb-movies-2008-reviews.json
# [ok] tmdb-movies-2009-reviews.json
# [ok] tmdb-movies-2010-reviews.json
# [ok] tmdb-movies-2011-reviews.json
# [ok] tmdb-movies-2012-reviews.json
# [ok] tmdb-movies-2013-reviews.json
# [ok] tmdb-movies-2014-reviews.json
# [ok] tmdb-movies-2015-reviews.json
# [ok] tmdb-movies-2016-reviews.json
# [ok] tmdb-movies-2017-reviews.json
# [ok] tmdb-movies-2018-reviews.json
# [ok] tmdb-movies-2019-reviews.json
# [ok] tmdb-movies-2020-reviews.json
Upload .tsv.gz to S3 Staging
import os
import sys
import threading
class ProgressPercentage(object):
    """Upload-progress callback that drives a tqdm bar.

    An instance is callable with the number of bytes just transferred, as
    done by the ``Callback`` argument of ``upload_fileobj`` in this file.
    """

    def __init__(self, buffer):
        """Create the bar, sized to the number of bytes held by *buffer*.

        Args:
            buffer: A seekable binary stream (e.g. ``io.BytesIO``) whose
                content is about to be uploaded.
        """
        self.buffer = buffer
        # Measure the payload, not the Python object: sys.getsizeof() reports
        # the memory footprint of the stream object, which over-states the
        # number of bytes that will actually be uploaded.
        self._size = self._buffer_size(buffer)
        self.pbar = tqdm(total=self._size, unit='B', unit_scale=True, desc='uploading')
        self._lock = threading.Lock()

    @staticmethod
    def _buffer_size(buffer):
        """Return the content length of a seekable stream, preserving its position."""
        position = buffer.tell()
        size = buffer.seek(0, io.SEEK_END)
        buffer.seek(position)
        return size

    def __call__(self, bytes_amount):
        """Advance the bar by *bytes_amount*; the lock serialises updates."""
        with self._lock:
            self.pbar.update(bytes_amount)

    def close(self):
        """Fill the bar up to its total and close it."""
        self.pbar.update(self._size - self.pbar.n)
        self.pbar.close()
def stage_file_to_s3(
        source_url,
        destination_bucket,
        destination_path):
    """Download a single file into memory and upload it to S3.

    Args:
        source_url: URL of the file to download.
        destination_bucket: Name of the S3 bucket that receives the file.
        destination_path: Object key under which the file is stored.

    Raises:
        requests.HTTPError: If the download responds with an error status.
    """
    s3 = boto3.client('s3')
    file_name = os.path.basename(source_url)
    buffer = io.BytesIO()

    # A single streamed request provides both the size and the body, and the
    # context manager releases the connection when the download is done.
    with requests.get(source_url, stream=True) as req:
        req.raise_for_status()
        content_length = req.headers.get('Content-Length')
        # total=None makes tqdm show a plain counter when the size is unknown.
        file_size = int(content_length) if content_length is not None else None
        with tqdm(total=file_size, unit='B', unit_scale=True, desc='downloading') as pbar:
            for chunk in req.iter_content(chunk_size=64 * 1024):
                if chunk:
                    buffer.write(chunk)
                    # The final chunk is usually short; count what was received.
                    pbar.update(len(chunk))

    upload_progress = ProgressPercentage(buffer)
    buffer.seek(0)
    s3.upload_fileobj(
        Fileobj=buffer,
        # Honour the caller's bucket instead of the notebook-level global.
        Bucket=destination_bucket,
        Key=destination_path,
        Callback=upload_progress)
    upload_progress.close()
    print(f'[ok] {file_name}')
stage_file_to_s3(
source_url=imdb_cast_source_url,
destination_bucket=staging_bucket,
destination_path=imdb_cast_staging_path)
# downloading: 0%| | 0.00/415M [00:00<?, ?B/s]
# uploading: 0%| | 0.00/434M [00:00<?, ?B/s]
# [ok] title.principals.tsv.gz
stage_file_to_s3(
source_url=imdb_names_source_url,
destination_bucket=staging_bucket,
destination_path=imdb_names_staging_path)
# downloading: 0%| | 0.00/235M [00:00<?, ?B/s]
# uploading: 0%| | 0.00/241M [00:00<?, ?B/s]
# [ok] name.basics.tsv.gz
From S3 Staging to Redshift Staging Tables
TMDb Movies
tmdb_movies_staging_url = f's3://{staging_bucket}/{tmdb_movies_staging_folder}'
%%sql
drop table if exists staging_tmdb_movies;
create table staging_tmdb_movies (
id integer,
video boolean,
vote_count bigint,
vote_average numeric(10, 6),
title varchar(256),
release_date timestamp,
original_language varchar(10),
original_title varchar(256),
genre_ids varchar(1024),
backdrop_path varchar(1024),
adult boolean,
overview varchar(10000),
poster_path varchar(1024),
popularity numeric(10, 6),
id_imdb varchar(32)
);
copy public.staging_tmdb_movies
from :tmdb_movies_staging_url
iam_role :iam_role
region 'us-east-1'
format as json 'auto';
%sql select count(*) from public.staging_tmdb_movies;
count |
---|
109222 |
TMDb Movie Reviews
tmdb_reviews_staging_url = f's3://{staging_bucket}/{tmdb_reviews_staging_folder}'
%%sql
drop table if exists staging_tmdb_reviews;
create table staging_tmdb_reviews (
author varchar(256),
content varchar(40000),
id varchar(40),
url varchar(256),
movie_id integer
);
copy public.staging_tmdb_reviews
from :tmdb_reviews_staging_url
iam_role :iam_role
region 'us-east-1'
format as json 'auto';
%sql select count(*) from public.staging_tmdb_reviews;
count |
---|
7153 |
IMDb Cast (Links)
imdb_cast_staging_url = f's3://{staging_bucket}/{imdb_cast_staging_path}'
%%sql
drop table if exists staging_imdb_cast;
create table staging_imdb_cast (
tconst varchar(40),
ordering varchar(10),
nconst varchar(40),
category varchar(256),
job varchar(1024),
characters varchar(2048)
);
copy public.staging_imdb_cast
from :imdb_cast_staging_url
iam_role :iam_role
region 'us-east-1'
delimiter '\t'
gzip;
%sql select count(*) from public.staging_imdb_cast;
count |
---|
52697448 |
IMDb Cast (Names)
imdb_names_staging_url = f's3://{staging_bucket}/{imdb_names_staging_path}'
%%sql
drop table if exists staging_imdb_names;
create table staging_imdb_names (
nconst varchar(40),
primaryname varchar(256),
birthyear varchar(10),
deathyear varchar(10),
primaryprofession varchar(256),
knownfortitles varchar(256)
);
copy public.staging_imdb_names
from :imdb_names_staging_url
iam_role :iam_role
region 'us-east-1'
delimiter '\t'
gzip;
%sql select count(*) from public.staging_imdb_names;
count |
---|
12022550 |
From Redshift Staging to Dimension Tables
Creating Tables
%%sql
drop table if exists public.dim_dates;
create table if not exists public.dim_dates
(
date_id timestamp without time zone not null,
year integer not null,
month integer not null,
day integer,
constraint dim_dates_pkey primary key (date_id)
);
drop table if exists public.dim_films;
create table if not exists public.dim_films
(
film_id varchar(32) not null,
date_id timestamp not null,
title varchar(256) not null,
constraint dim_films_pkey primary key (film_id),
constraint dim_films_fkey_dates foreign key (date_id) references dim_dates (date_id)
);
drop table if exists public.dim_cast;
create table if not exists public.dim_cast
(
cast_id varchar(32) not null,
film_id varchar(32) not null,
full_name varchar(256) not null,
constraint dim_cast_pkey primary key (cast_id),
constraint dim_cast_fkey_films foreign key (film_id) references dim_films (film_id)
);
drop table if exists public.dim_reviews;
create table if not exists public.dim_reviews
(
review_id varchar(32) not null,
film_id varchar(32) not null,
text varchar(40000) not null,
constraint dim_reviews_pkey primary key (review_id),
constraint dim_reviews_fkey_films foreign key (film_id) references dim_films (film_id)
);
drop table if exists public.fact_film_review_sentiments;
create table if not exists public.fact_film_review_sentiments
(
date_id timestamp without time zone not null,
film_id varchar(32) not null,
review_id varchar(32) not null,
review_sentiment_class integer not null,
constraint fact_films_review_sentiments_pkey primary key (date_id, film_id, review_id),
constraint fact_films_review_sentiments_fkey_dates foreign key (date_id) references dim_dates (date_id),
constraint fact_films_review_sentiments_fkey_films foreign key (film_id) references dim_films (film_id),
constraint fact_films_review_sentiments_fkey_reviews foreign key (review_id) references dim_reviews (review_id)
);
drop table if exists public.fact_cast_review_sentiments;
create table if not exists public.fact_cast_review_sentiments
(
date_id timestamp without time zone not null,
cast_id varchar(32) not null,
review_id varchar(32) not null,
review_sentiment_class integer not null,
constraint fact_cast_review_sentiments_pkey primary key (date_id, cast_id, review_id),
constraint fact_cast_review_sentiments_fkey_dates foreign key (date_id) references dim_dates (date_id),
constraint fact_cast_review_sentiments_fkey_cast foreign key (cast_id) references dim_cast (cast_id),
constraint fact_cast_review_sentiments_fkey_reviews foreign key (review_id) references dim_reviews (review_id)
);
dim_dates
%%sql
truncate table dim_dates;
insert into dim_dates
select release_date as date_id,
datepart(year, release_date) as year,
datepart(month, release_date) as month,
datepart(day, release_date) as day
from (
select distinct release_date
from staging_tmdb_movies
where not release_date is null);
dim_films
%%sql
truncate table dim_films;
insert into dim_films (film_id, date_id, title)
select id, release_date, title
from staging_tmdb_movies
where not release_date is null;
dim_cast
%%sql
truncate table dim_cast;
insert into dim_cast (cast_id, film_id, full_name)
select imdbc.nconst, tmdbm.id, imdbn.primaryname
from staging_imdb_cast as imdbc
inner join staging_imdb_names as imdbn on imdbc.nconst = imdbn.nconst
inner join staging_tmdb_movies as tmdbm on tmdbm.id_imdb = imdbc.tconst
where not release_date is null
and imdbc.category in ('actor', 'actress');
dim_reviews
%%sql
truncate table dim_reviews;
insert into dim_reviews (review_id, film_id, text)
select id, movie_id, content
from staging_tmdb_reviews;
fact_film_review_sentiments
%%sql
truncate table fact_film_review_sentiments;
insert into fact_film_review_sentiments (
date_id,
film_id,
review_id,
review_sentiment_class)
select df.date_id, df.film_id, dr.review_id, 0
from dim_films as df
inner join dim_reviews as dr on df.film_id = dr.film_id
fact_cast_review_sentiments
%%sql
truncate table fact_cast_review_sentiments;
insert into fact_cast_review_sentiments (
date_id,
cast_id,
review_id,
review_sentiment_class)
select df.date_id, dc.cast_id, dr.review_id, 0
from dim_cast as dc
inner join dim_films as df on dc.film_id = df.film_id
inner join dim_reviews as dr on dc.film_id = dr.film_id
Sentiment Classification
!wget -q --show-progress https://github.com/datalaker/data-engineering-shared/raw/main/models/movie-sentiment-review-model.tar.gz
!tar -xvf movie-sentiment-review-model.tar.gz
movie-sentiment-rev 100%[===================>] 7.84M --.-KB/s in 0.09s
movie-sentiment-classifier-model/
movie-sentiment-classifier-model/variables/
movie-sentiment-classifier-model/saved_model.pb
movie-sentiment-classifier-model/assets/
movie-sentiment-classifier-model/assets/tokenizer.json
movie-sentiment-classifier-model/variables/variables.data-00000-of-00001
movie-sentiment-classifier-model/variables/variables.index
import tensorflow as tf
from tensorflow import keras

# Load the exported classifier and rebuild its tokenizer from the bundled JSON.
model = tf.saved_model.load('./movie-sentiment-classifier-model')
with open('./movie-sentiment-classifier-model/assets/tokenizer.json', 'r', encoding='utf-8') as tokenizer_file:
    tokenizer_json = json.dumps(json.load(tokenizer_file))
tokenizer = keras.preprocessing.text.tokenizer_from_json(tokenizer_json)

# Pull every review out of the warehouse, keeping id and text together.
db = psycopg2.connect(CONN)
cur = db.cursor()
cur.execute('select review_id, text from dim_reviews')
reviews = [
    {'review_id': row_id, 'text': row_text}
    for row_id, row_text in tqdm(cur.fetchall(), desc='reviews')
]
cur.close()

# Turn the texts into fixed-length (500) sequences, zero-padded at the end.
review_texts = [review['text'] for review in reviews]
review_seqs = tf.keras.preprocessing.sequence.pad_sequences(
    tokenizer.texts_to_sequences(review_texts),
    maxlen=500,
    dtype='float32',
    padding='post',
    value=0)
(len(reviews), len(review_texts), review_seqs.shape)
reviews: 0%| | 0/7153 [00:00<?, ?it/s]
(7153, 7153, (7153, 500))
review_preds = model(inputs=review_seqs)
len(review_preds)
def update_review_sentiment_class_for(table):
    """Write the predicted sentiment class of every review into *table*.

    Walks ``review_preds`` in step with ``reviews`` and updates the rows of
    *table* whose ``review_id`` matches, committing in batches of 200.
    A prediction whose argmax is 0 is stored as 1, any other as -1.

    Args:
        table: Name of the fact table to update. It is interpolated into the
            SQL text, so it must be a trusted identifier (callers in this
            notebook pass literals only).

    Raises:
        Exception: Whatever the database raises; the open transaction is
            rolled back before the error propagates.
    """
    batch_size = 200
    sql = f"""update {table}
        set review_sentiment_class = %s
        where review_id = %s"""

    cur = db.cursor()
    pbar = tqdm(enumerate(review_preds), total=len(review_preds), desc=table)

    def flush(batch):
        # Send the pending updates, make them durable, then start a new batch.
        cur.executemany(sql, batch)
        db.commit()
        batch.clear()
        pbar.refresh()

    try:
        batch = []
        for i, review_pred in pbar:
            review_id = reviews[i]['review_id']
            review_sentiment = -1 if np.argmax(review_pred) else 1
            batch.append((review_sentiment, review_id))
            if len(batch) >= batch_size:
                flush(batch)
        if batch:
            flush(batch)
    except Exception:
        db.rollback()
        raise
    finally:
        # Release the progress bar and cursor on success and on failure alike.
        pbar.close()
        cur.close()
update_review_sentiment_class_for('fact_film_review_sentiments')
fact_film_review_sentiments: 0%| | 0/7153 [00:00<?, ?it/s]
update_review_sentiment_class_for('fact_cast_review_sentiments')
fact_cast_review_sentiments: 0%| | 0/7153 [00:00<?, ?it/s]
Quality Checks
Counts
Dates
%%sql dates_source_count <<
select count(distinct release_date)
from staging_tmdb_movies
%%sql dates_dest_count <<
select count(date_id)
from dim_dates
print((dates_source_count[0][0], dates_dest_count[0][0]))
assert dates_source_count == dates_dest_count
(7526, 7526)
Films
%%sql films_source_count <<
select count(id_imdb)
from staging_tmdb_movies
where not release_date is null
%%sql films_dest_count <<
select count(film_id)
from dim_films
print((films_source_count[0][0], films_dest_count[0][0]))
assert films_source_count == films_dest_count
(103780, 103780)
Reviews
%%sql reviews_source_count <<
select count(id)
from staging_tmdb_reviews
%%sql reviews_dest_count <<
select count(review_id)
from dim_reviews
print((reviews_source_count[0][0], reviews_dest_count[0][0]))
assert reviews_source_count == reviews_dest_count
(7153, 7153)
Cast
%%sql cast_source_count <<
select count(imdbc.nconst)
from staging_imdb_cast as imdbc
inner join staging_tmdb_movies as tmdbm on imdbc.tconst = tmdbm.id_imdb
inner join staging_imdb_names as imdbn on imdbc.nconst = imdbn.nconst
where not release_date is null
and imdbc.category in ('actor', 'actress');
%%sql cast_dest_count <<
select count(cast_id)
from dim_cast
print((cast_source_count[0][0], cast_dest_count[0][0]))
assert cast_source_count == cast_dest_count
(348800, 348800)
Film Facts
%%sql film_fact_source_count <<
select count(0)
from dim_films as df
inner join dim_reviews as dr on df.film_id = dr.film_id
%%sql film_fact_dest_count <<
select count(0)
from fact_film_review_sentiments
print((film_fact_source_count[0][0], film_fact_dest_count[0][0]))
assert film_fact_source_count == film_fact_dest_count
(7153, 7153)
Cast Facts
%%sql cast_fact_source_count <<
select count(0)
from dim_cast as dc
inner join dim_reviews as dr on dc.film_id = dr.film_id
%%sql cast_fact_dest_count <<
select count(0)
from fact_cast_review_sentiments
print((cast_fact_source_count[0][0], cast_fact_dest_count[0][0]))
assert cast_fact_source_count == cast_fact_dest_count
(27810, 27810)
Existence
%%sql films_not_in_facts <<
select df.film_id
from dim_films as df
inner join dim_reviews as dr on df.film_id = dr.film_id
where df.film_id not in (select f.film_id from fact_film_review_sentiments as f)
assert not films_not_in_facts
%%sql reviews_not_in_facts <<
select dr.review_id
from dim_reviews as dr
where dr.review_id not in (select f.review_id from fact_film_review_sentiments as f)
assert not reviews_not_in_facts
%%sql cast_not_in_facts <<
select dc.cast_id
from dim_cast as dc
inner join dim_reviews as dr on dc.film_id = dr.film_id
where dc.cast_id not in (select f.cast_id from fact_cast_review_sentiments as f)
assert not cast_not_in_facts
Ranges
%%sql sentiment_classes <<
select distinct review_sentiment_class
from fact_film_review_sentiments
union
select distinct review_sentiment_class
from fact_cast_review_sentiments
# Range check on the fact tables. The fact loads insert 0 as a placeholder and
# update_review_sentiment_class_for then overwrites it with -1 or 1, so after
# classification those two are the only values that may remain.
observed_sentiment_classes = {row[0] for row in sentiment_classes}
print(observed_sentiment_classes)
assert observed_sentiment_classes <= {-1, 1}, \
    f'unexpected sentiment classes: {observed_sentiment_classes}'
# {0}
Analysis
Top 10 Films
%%sql
select df.title, sum(f.review_sentiment_class) as sentiment
from fact_film_review_sentiments as f
inner join dim_films as df on f.film_id = df.film_id
inner join dim_reviews as dr on f.review_id = dr.review_id
group by df.title
order by sentiment desc
limit 10;
title | sentiment |
---|---|
Spider-Man: Into the Spider-Verse | 38 |
The Avengers | 31 |
Avengers: Age of Ultron | 30 |
Spider-Man | 23 |
Avengers: Infinity War | 23 |
Spider-Man 2 | 19 |
Thor | 18 |
Assassin 33 A.D. | 18 |
Big Hero 6 | 14 |
Doctor Strange | 14 |
10 Worst Films
%%sql
select df.title, sum(f.review_sentiment_class) as sentiment
from fact_film_review_sentiments as f
inner join dim_films as df on f.film_id = df.film_id
inner join dim_reviews as dr on f.review_id = dr.review_id
group by df.title
order by sentiment asc
limit 10;
title | sentiment |
---|---|
Thor: The Dark World | -12 |
Godzilla: King of the Monsters | -6 |
Suicide Squad | -6 |
The Forest | -5 |
The Mummy | -5 |
Underworld: Blood Wars | -5 |
The Grudge | -5 |
Star Wars: The Rise of Skywalker | -5 |
Godzilla | -5 |
Star Wars: The Last Jedi | -5 |
Top 10 Actors in Films with Best Reviews
%%sql
select dc.full_name, sum(f.review_sentiment_class) as sentiment
from fact_cast_review_sentiments as f
inner join dim_cast as dc on f.cast_id = dc.cast_id
inner join dim_reviews as dr on f.review_id = dr.review_id
group by dc.full_name
order by sentiment desc
limit 10;
full_name | sentiment |
---|---|
Chris Evans | 3654 |
James Franco | 3038 |
Robert Downey Jr. | 2912 |
Mark Ruffalo | 2838 |
Scarlett Johansson | 2368 |
Willem Dafoe | 1848 |
Chris Hemsworth | 1530 |
Samuel L. Jackson | 1456 |
Prakash Raj | 1400 |
Hugh Jackman | 1170 |
Sentiment Over Time
%%sql
select dt.year, sum(f.review_sentiment_class) as sentiment
from fact_film_review_sentiments as f
inner join dim_dates as dt on f.date_id = dt.date_id
group by dt.year
order by dt.year asc;
year | sentiment |
---|---|
2000 | 48 |
2001 | 22 |
2002 | 57 |
2003 | 37 |
2004 | 42 |
2005 | 28 |
2006 | 48 |
2007 | 41 |
2008 | 59 |
2009 | 65 |
2010 | 75 |
2011 | 69 |
2012 | 149 |
2013 | 78 |
2014 | 192 |
2015 | 313 |
2016 | 233 |
2017 | 89 |
2018 | 148 |
2019 | 194 |
2020 | 34 |
Reporting
Sentiment Normalization
# Half-width of the reporting scale: normalized values are centred on this
# number and reach twice this number at the maximum sentiment.
sentiment_scale = 2.5


def sentiment_normalizer(max_sentiment):
    """Build a function that maps a raw sentiment sum onto the report scale.

    The returned function computes
    ``sentiment_scale + sentiment_scale * x / max_sentiment`` rounded to two
    decimals, so ``x == max_sentiment`` maps to ``2 * sentiment_scale`` and
    ``x == 0`` maps to ``sentiment_scale``.

    Args:
        max_sentiment: The largest raw sentiment in the data being normalized.

    Returns:
        A one-argument callable taking a raw sentiment and returning a float.
        When *max_sentiment* is zero there is no spread to scale by, so the
        callable returns the midpoint ``sentiment_scale`` for every input
        instead of dividing by zero.
    """
    if not max_sentiment:
        return lambda x: round(sentiment_scale, 2)
    return lambda x: round(sentiment_scale + (sentiment_scale * x / max_sentiment), 2)
Sentiment Per Films
%%sql film_sentiments <<
select df.title, sum(f.review_sentiment_class) as sentiment
from fact_film_review_sentiments as f
inner join dim_films as df on f.film_id = df.film_id
group by df.title
order by sentiment desc;
df_films = pd.DataFrame(film_sentiments, columns=['title', 'sentiment'])
df_films = df_films.set_index('title')
df_films.describe()
df_films_normalizer = sentiment_normalizer(max(df_films.sentiment))
df_films['normalized_sentiment'] = df_films['sentiment'].map(df_films_normalizer)
df_films.head()
sentiment | normalized_sentiment | |
---|---|---|
title | ||
Spider-Man: Into the Spider-Verse | 38 | 5.00 |
The Avengers | 31 | 4.54 |
Avengers: Age of Ultron | 30 | 4.47 |
Spider-Man | 23 | 4.01 |
Avengers: Infinity War | 23 | 4.01 |
Sentiment Per Actor
%%sql cast_sentiments <<
select dc.full_name, sum(f.review_sentiment_class) as sentiment
from fact_cast_review_sentiments f
inner join dim_cast as dc on f.cast_id = dc.cast_id
group by dc.full_name
order by sentiment desc;
* postgresql://admin:***@default.684199068947.us-east-1.redshift-serverless.amazonaws.com:5439/dev
9766 rows affected.
Returning data to local variable cast_sentiments
df_cast = pd.DataFrame(cast_sentiments, columns=['full_name', 'sentiment'])
df_cast = df_cast.set_index('full_name')
df_cast_normalizer = sentiment_normalizer(max(df_cast.sentiment))
df_cast['normalized_sentiment'] = df_cast['sentiment'].map(df_cast_normalizer)
df_cast.head()
sentiment | normalized_sentiment | |
---|---|---|
full_name | ||
Chris Evans | 3654 | 5.00 |
James Franco | 3038 | 4.58 |
Robert Downey Jr. | 2912 | 4.49 |
Mark Ruffalo | 2838 | 4.44 |
Scarlett Johansson | 2368 | 4.12 |
Sentiment Per Year
%%sql year_sentiments <<
select dt.year, sum(f.review_sentiment_class) as sentiment
from fact_film_review_sentiments as f
inner join dim_dates as dt on f.date_id = dt.date_id
group by dt.year
order by dt.year asc;
* postgresql://admin:***@default.684199068947.us-east-1.redshift-serverless.amazonaws.com:5439/dev
21 rows affected.
Returning data to local variable year_sentiments
df_year = pd.DataFrame(year_sentiments, columns=['year', 'sentiment'])
df_year = df_year.set_index('year')
df_year.transpose()
year | 2000 | 2001 | 2002 | 2003 | 2004 | 2005 | 2006 | 2007 | 2008 | 2009 | ... | 2011 | 2012 | 2013 | 2014 | 2015 | 2016 | 2017 | 2018 | 2019 | 2020 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
sentiment | 48 | 22 | 57 | 37 | 42 | 28 | 48 | 41 | 59 | 65 | ... | 69 | 149 | 78 | 192 | 313 | 233 | 89 | 148 | 194 | 34 |
from datetime import datetime
from reportlab.lib.units import cm
from reportlab.lib.utils import ImageReader
from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.platypus import SimpleDocTemplate, Paragraph, Image, Table, TableStyle, Spacer, PageBreak
from reportlab.lib.colors import black
# Read the clock once so year, month and day always describe the same instant
# (three separate reads could straddle midnight).
_report_date = datetime.now()
year = f'{_report_date.year:04d}'
month = f'{_report_date.month:02d}'
day = f'{_report_date.day:02d}'
title = f'TMDb, Film Review Sentiment Analysis ({year}-{month}-{day})'
os.makedirs('images', exist_ok=True)

# Chart: density plot of the per-film sentiment sums
film_review_distro_path = './images/film_review_distro_fig.png'
film_density_axes = df_films[['sentiment']].plot.density(bw_method=1, grid=True, figsize=(7, 3))
film_review_distro_fig = film_density_axes.get_figure()
film_review_distro_fig.savefig(film_review_distro_path, format='png')

# Chart: bar plot of the sentiment sum per year
year_review_distro_path = './images/year_review_distro_fig.png'
year_bar_axes = df_year.plot.bar(grid=True, figsize=(7, 4))
year_review_distro_fig = year_bar_axes.get_figure()
year_review_distro_fig.savefig(year_review_distro_path, format='png')

# Table: first 10 rows of df_films (normalized score only)
top_10_films = df_films[['normalized_sentiment']].head(10)

# Table: last 10 rows of df_films, re-sorted so the lowest score comes first
last_10_films = df_films[['normalized_sentiment']].tail(10)
worst_10_films = last_10_films.sort_values(by='normalized_sentiment', ascending=True)

# Table: first 10 rows of df_cast (normalized score only)
top_10_cast = df_cast[['normalized_sentiment']].head(10)
!wget -q --show-progress https://github.com/datalaker/data-engineering-shared/raw/main/images/the_movie_db.png -O images/header.png
images/header.png 100%[===================>] 176.62K --.-KB/s in 0.02s
# Paragraph text is parsed as markup, so data-derived text must be escaped.
from xml.sax.saxutils import escape

doc = SimpleDocTemplate('report.pdf', pagesize=A4, rightMargin=cm, leftMargin=cm, topMargin=cm, bottomMargin=cm)
doc.title = title
width, height = A4

# Build the sample stylesheet once and pick the styles used by the report.
_styles = getSampleStyleSheet()
style_title = _styles["title"]
style_h1 = _styles["h1"]
style_normal = _styles["bu"]
style_grid = TableStyle([
    ('GRID', (0, 0), (-1, -1), 1, black),
    ('ALIGN', (1, 0), (-1, -1), 'RIGHT')])
br = Spacer(width, 20)


def _grid_table(frame):
    # Turn a DataFrame (index included as first column) into a gridded Table.
    return Table(frame.copy().reset_index().to_numpy().tolist(), style=style_grid)


# A title containing '&' or '<' would otherwise be read as paragraph markup.
top_film_title = escape(str(df_films.head(1).index[0]))

elements = []
elements.append(Paragraph(title, style=style_title))
elements.append(Image('./images/header.png', width-(2*cm), 220))
elements.append(br)

elements.append(Paragraph('Executive Summary', style=style_h1))
elements.append(Paragraph(f'The top film in our database, according to TMDB reviews is <strong>{top_film_title}</strong>', style=style_normal))
elements.append(br)

elements.append(Paragraph('Top 10 Films', style=style_h1))
elements.append(Paragraph('Here are the top 10 films in our database, according to the sentiment found in the TMDb reviews, ranging from 0 (negative) to 5 (positive).', style=style_normal))
elements.append(_grid_table(top_10_films))
elements.append(br)

elements.append(Paragraph('Worst 10 Films', style=style_h1))
elements.append(Paragraph('Here are the worst 10 films in our database, according to the sentiment found in the TMDb reviews, ranging from 0 (negative) to 5 (positive).', style=style_normal))
elements.append(_grid_table(worst_10_films))
elements.append(br)

elements.append(Paragraph('Review Sentiment Distribution', style=style_h1))
elements.append(Image(film_review_distro_path))
elements.append(br)

elements.append(Paragraph('Top 10 Actors/Actresses in Best Reviewed Films', style=style_h1))
elements.append(Paragraph('The ranking below is of actors that worked in films with positive reviews. Reviews are not made directly to actors, but to their films', style=style_normal))
elements.append(_grid_table(top_10_cast))
elements.append(br)

# NOTE(review): the image below is the per-year sentiment bar chart; the
# heading mentions IMDb average voting, which this chart does not plot - confirm.
elements.append(Paragraph('IMDb Average Voting vs TMDb Sentiment Reviews', style=style_h1))
elements.append(Image(year_review_distro_path))

doc.build(elements)
!pip install -qq watermark
%reload_ext watermark
%watermark -a "Sparsh A." -m -iv -u -t -d
# Compiler : GCC 7.5.0
# OS : Linux
# Release : 5.10.133+
# Machine : x86_64
# Processor : x86_64
# CPU cores : 2
# Architecture: 64bit
# json : 2.0.9
# tensorflow: 2.9.2
# sys : 3.7.15 (default, Oct 12 2022, 19:14:55)
# [GCC 7.5.0]
# psycopg2 : 2.9.4
# numpy : 1.21.6
# pandas : 1.3.5
# matplotlib: 3.5.3
# requests : 2.28.1
# boto3 : 1.24.96