In [1]:
import sqlite3
import pandas as pd
con = sqlite3.connect('f19_day15.sqlite')
def x(q):
    return pd.read_sql_query(q,con)
In [4]:
x('select * from reviewer limit 6')
Out[4]:
rid firstname lastname
0 777 John Ringland
1 102 Tong Wu
2 3333 Chima Ezeilo
3 716 Ashok Kumar
4 515 Ji Ho Lee
5 1570 Tianmou Liu
In [5]:
# How many people actually provided a review?

q = 'select count(*) from rating'  # wrong
x(q)
Out[5]:
count(*)
0 179
In [6]:
q = 'select distinct rid from rating' 
x(q).head()
Out[6]:
rid
0 777
1 3333
2 1570
3 1123
4 170193
In [7]:
q = 'select count(distinct rid) from rating'  
x(q)
Out[7]:
count(distinct rid)
0 31
In [2]:
# who signed up as a reviewer but did not rate any movie?
signed_up = 'select rid from reviewer'
did_reviews = 'select distinct rid from rating'
q = 'select rid from (' + signed_up +') where rid not in (' + did_reviews + ')'
q2 = 'select firstname,lastname from (' + q + ') natural join reviewer'
#x(q2).head()
In [9]:
# Who rated Avatar?

q = 'select firstname,lastname from rating natural join movie natural join reviewer where title="Avatar"'
x(q)
Out[9]:
firstname lastname
0 Abhishek Mishra
1 Dorien Verbruggen
2 Eugen Feng
In [10]:
# Which movies were rated by more than one person?
# NOTE: use of "having", which is used to filter after a "group by"
q = 'select count(rid) as nrev,mid from rating group by mid  having nrev>1'
q2 = 'select title,nrev from (' + q + ') natural join movie'
x(q2).head()
Out[10]:
title nrev
0 Birdman 4
1 The Incredibles 3
2 Avengers: EndGame 5
3 Forrest Gump 4
4 Bicycle Thieves 2
In [11]:
# Who is the best director?
# i.e. who has the highest average of all the ratings of movies directed by him/her

# first get a table of average ratings grouped by director
q = 'select avg(stars) as avstars,director from rating natural join movie group by director having director != ""'
q2 = 'select * from (' + q + ') where avstars= (select max(avstars) from (' + q + '))'
print(q2)
x(q2).head()
select * from (select avg(stars) as avstars,director from rating natural join movie group by director having director != "") where avstars= (select max(avstars) from (select avg(stars) as avstars,director from rating natural join movie group by director having director != ""))
Out[11]:
avstars director
0 5.0 Akiro Kurosawa
1 5.0 Ang Lee
2 5.0 Bong Joon-ho
3 5.0 Christophe Barratier
4 5.0 Christopher McQuarrie

Getting rows where an extremum (min, max, or even avg, etc.) is attained

Since getting the rows where a certain field is extremized seems to be something we want to do very often, that is rather painful if we write the SQL directly, maybe we could write a function that makes it simple to generate a query that does that.

In [12]:
def extremizers( q , agg, field ):
    return 'select * from (' + q + ') where ' + field + '= (select ' + agg + '(' + field + ') from (' + q + '))'
extremizers( 'movie', 'max', 'year' )
Out[12]:
'select * from (movie) where year= (select max(year) from (movie))'
In [13]:
q = 'select avg(stars) as avstars,director from rating natural join movie group by director having director != ""'
x( extremizers( q, 'max', 'avstars') ).head()
Out[13]:
avstars director
0 5.0 Akiro Kurosawa
1 5.0 Ang Lee
2 5.0 Bong Joon-ho
3 5.0 Christophe Barratier
4 5.0 Christopher McQuarrie

We can use this in a nested way, like this:

In [14]:
# Of the directors with the highest average rating, which have the largest number of ratings?

q = 'select avg(stars) as avstars, count(stars) as nrev, director from rating natural join movie group by director having director != ""'
q2 = extremizers( q, 'max', 'avstars' )
q3 = extremizers( q2, 'max', 'nrev' )
x(q3)
Out[14]:
avstars nrev director
0 5.0 2 Jon Favreau
1 5.0 2 Ron Howard

Look how horrible the actual SQL is, we really wouldn't want to write this ourselves!:

In [15]:
q3
Out[15]:
'select * from (select * from (select avg(stars) as avstars, count(stars) as nrev, director from rating natural join movie group by director having director != "") where avstars= (select max(avstars) from (select avg(stars) as avstars, count(stars) as nrev, director from rating natural join movie group by director having director != ""))) where nrev= (select max(nrev) from (select * from (select avg(stars) as avstars, count(stars) as nrev, director from rating natural join movie group by director having director != "") where avstars= (select max(avstars) from (select avg(stars) as avstars, count(stars) as nrev, director from rating natural join movie group by director having director != ""))))'

SQL "with" construct

This actually is a way of defining a temporary variable in SQL.

In [16]:
q = 'select avg(stars) as avstars,director from rating natural join movie group by director having director != ""'
q2 = 'select * from (' + q + ') where avstars= (select max(avstars) from (' + q + '))'

q2 = 'with QQQ as (' + q + ') select * from QQQ where avstars= (select max(avstars) from QQQ)'
print(q2)
x(q2).head(8)
with QQQ as (select avg(stars) as avstars,director from rating natural join movie group by director having director != "") select * from QQQ where avstars= (select max(avstars) from QQQ)
Out[16]:
avstars director
0 5.0 Akiro Kurosawa
1 5.0 Ang Lee
2 5.0 Bong Joon-ho
3 5.0 Christophe Barratier
4 5.0 Christopher McQuarrie
5 5.0 Edward Yang
6 5.0 Gavin O'Connor
7 5.0 Giuseppe Tornatore

If we rewrite our extremizers function using "with", the resulting SQL will be shorter and a bit easier to comprehend:

In [17]:
def extremizers( q , agg, field ):
    return 'with QQQ as (' + q + ') select * from QQQ where ' + field + '=(select ' + agg + '(' + field + ') from QQQ)'
extremizers( 'movie', 'max', 'year' )
Out[17]:
'with QQQ as (movie) select * from QQQ where year=(select max(year) from QQQ)'

Redoing the best director query:

In [18]:
q = 'select avg(stars) as avstars, count(stars) as nrev, director from rating natural join movie group by director having director != ""'
q2 = extremizers( q, 'max', 'avstars' )
q3 = extremizers( q2, 'max', 'nrev' )
x(q3)
Out[18]:
avstars nrev director
0 5.0 2 Jon Favreau
1 5.0 2 Ron Howard
In [19]:
q3
Out[19]:
'with QQQ as (with QQQ as (select avg(stars) as avstars, count(stars) as nrev, director from rating natural join movie group by director having director != "") select * from QQQ where avstars=(select max(avstars) from QQQ)) select * from QQQ where nrev=(select max(nrev) from QQQ)'

Shorter than before. It's still a bit confusing because "QQQ" stands for different things in different parts, but at least there is no code duplication.

After class

By request, I am providing queries to answer some more of the questions you asked in the Google sheet.

In [23]:
# What is the most highly rated movie(s)?
# Again needs a bit of definition of what that means.
# I'll just say it's those with the highest average rating.

q = 'select mid,avg(stars) as avstars from rating group by mid'
q2 = extremizers( q, 'max', 'avstars')
q3 = 'select * from (' + q2 + ') natural join movie'
x(q3)
Out[23]:
mid avstars year title director
0 103 5.0 1999 Topsy Turvy Mike Leigh
1 106 5.0 1962 Knife in the Water Roman Polanski
2 107 5.0 1970 Dodes'ka-den Akiro Kurosawa
3 113 5.0 1998 Saving Private Ryan Steven Spielberg
4 118 5.0 1994 Léon: The Professional Luc Besson
5 136 5.0 2000 A one and a two Edward Yang
6 140 5.0 2016 Hacksaw Ridge Mel Gibson
7 144 5.0 2018 Green Book Peter Farrelly
8 151 5.0 2013 The World's End Edgar Wright
9 152 5.0 2002 Hero Zhang Yimou
10 153 5.0 2003 Memories Of Murder Bong Joon-ho
11 155 5.0 2001 A beautiful mind Ron Howard
12 156 5.0 2016 The man who knew infinity Matthew Brown
13 159 5.0 2019 The Lion King Jon Favreau
14 162 5.0 2009 Cloudy with a chance of meatballs
15 163 5.0 1993 Dave Ivan Reitman
16 164 5.0 1985 Real Genius Martha Coolidge
17 166 5.0 2009 The Time Traveler's Wife Robert Schwentke
18 167 5.0 1988 Grave of the Fireflies Isao Takahata
19 169 5.0 1994 In the Heat of the Sun Jiang Wen
20 170 5.0 2004 Howl's Moving Castle Hayao Miyazaki
21 174 5.0 2010 The King's Speech Tom Hooper
22 176 5.0 2010 Inception Christopher Nolan
23 180 5.0 2000 Crouching Tiger, Hidden Dragon Ang Lee
24 181 5.0 2018 Unfriended: Dark Web Stephen Susco
25 183 5.0 1998 The Legend of 1900 Giuseppe Tornatore
26 184 5.0 2016 Train to Busan Yeon Sang-ho
27 186 5.0 2019 The Gangster, The Cop, The Devil Won-Tae Lee
28 187 5.0 2007 Gregory Hoblit
29 189 5.0 2013 Lone Survivor Peter Berg
30 190 5.0 2001 Black Hawk Down Ridley Scott
31 191 5.0 2005 Jarhead Sam Mendes
32 195 5.0 2018 Mission Impossible - Fallout Christopher McQuarrie
33 196 5.0 2003 Johnny English Peter Howitt
34 197 5.0 1994 Andaz Apna Apna Rajkumar Santoshi
35 198 5.0 2018 Andhadhun Sriram Raghavan
36 200 5.0 2004 Troy Wolfgang Petersen
37 215 5.0 2004 The Spongebob Squarepants Movie Stephen Hillenburg
38 216 5.0 2011 Warrior Gavin O'Connor
39 220 5.0 2001 Amelie Jean-Pierre Jeunet
40 222 5.0 2004 The Chorus Christophe Barratier
In [24]:
# Is there a correlation between the time period that a movie was released in and movie rating?

# See other question about decades.
In [35]:
# Which movie has the most consistent rating (same num of stars from multiple reviewers)?

#q1 = 'select mid, min(stars) as minstars, max(stars) as maxstars from rating group by mid'
q1 = 'select mid, count(stars) as nrev, max(stars)-min(stars) as range from rating group by mid having nrev>1'
print('step 1')
display( x(q1).head() )
q2 = extremizers( q1, 'min', 'range' )
print('step 2: just those with minimum "range"')
display( x(q2).head() )
q3 = 'select title, range from (' + q2 + ') natural join movie' 
print('step 3')
x(q3)
step 1
mid nrev range
0 100 4 3
1 101 3 2
2 104 5 4
3 105 4 1
4 109 2 2
step 2: just those with minimum "range"
mid nrev range
0 112 2 0
1 113 2 0
2 114 2 0
3 139 3 0
4 155 2 0
step 3
Out[35]:
title range
0 The Terminal 0
1 Saving Private Ryan 0
2 Catch Me If You Can 0
3 21 Jump Street 0
4 A beautiful mind 0
5 The Lion King 0
6 Inception 0
In [36]:
q3
Out[36]:
'select title, range from (with QQQ as (select mid, count(stars) as nrev, max(stars)-min(stars) as range from rating group by mid having nrev>1) select * from QQQ where range=(select min(range) from QQQ)) natural join movie'
In [41]:
# Which user submitted the largest number of 1 star ratings?

q1 = 'select rid,count(stars) as n1star from rating where stars=1 group by rid'
q2 = extremizers( q1, 'max', 'n1star')
x(q2)
Out[41]:
rid n1star
0 11235 1
1 1920 1
2 1972 1
3 3333 1
4 777 1

Apparently no one gave more than one 1-star review.

In [43]:
# Which director appears the most in the database?

# I'll interpret this to mean who directed the largest number of movies listed in movie table.

q1 = 'select director,count(mid) as nmov from movie group by director having director != ""'
q2 = extremizers( q1, 'max', 'nmov' )
x(q2)
Out[43]:
director nmov
0 Christopher Nolan 4
In [49]:
# Who reviewed movies with the longest time span?
q1 = 'select firstname,lastname,max(year)-min(year) as range from rating natural join movie natural join reviewer group by rid'
print('step 1')
display( x(q1).head() )
q2 = extremizers( q1, 'max', 'range' )
print('Final result')
x(q2)
step 1
firstname lastname range
0 Usman Hussain 41
1 Gifari Hoque 16
2 Tong Wu 17
3 Dmitri Kireyev 0
4 Eyob TM 36
Final result
Out[49]:
firstname lastname range
0 Jacob Almudevar 66
1 Bernard Badzioch 66
In [67]:
# What is the proportion of movies that were rated by only one person?

q1 = 'select mid,count(mid) as nrat from rating group by mid having nrat=1'
q2 = 'select count(mid) as nby1 from (' + q1 + ')'
display( x(q2) )
q3 = 'select count(distinct mid) as nall from rating'
display( x(q3) )
# How do I combine these two results?
q4 = 'select * from (' + q2 + ') cross join (' + q3 + ')'
display( x(q4) )
q5 = 'select 100*cast(nby1 as float)/nall as "percentage rated by one person" from (' + q2 + ') cross join (' + q3 + ')'
x(q5)
nby1
0 65
nall
0 111
nby1 nall
0 65 111
Out[67]:
percentage rated by one person
0 58.558559

I don't really know if that is the standard way of doing arithmetic on numbers in SQL. Does anyone else know?

In [70]:
# How many movies were reviewed in each decade?

q0 = 'select distinct movie.mid, year from movie inner join rating'  # get the movies that were reviewed
q1 = 'select year,(year/10)*10 as decade from (' + q0 + ')' 
q2 = 'select decade,count(decade) from (' + q1 + ') group by decade'
x(q2)
Out[70]:
decade count(decade)
0 0 1
1 1940 1
2 1960 2
3 1970 3
4 1980 3
5 1990 15
6 2000 38
7 2010 56
In [ ]: