Movie database queries

In [1]:
sheets = {
'f19_reviewer.xlsx':'1KJpIeML2f2t6UxxGFzOTOQPM0xtyUgJVVBrW3I9nyjU', # except change first Z to K
'f19_movie.xlsx'   :'1rrAUcQxNTGcwZTnUEI5kZJvDGjLE7V1LvB5Rey5rY1w', # except change first Z to r
'f19_rating.xlsx'  :'108q2iAlSiq8AQUJCOVa5uj8lkjIpBIPLIr83C41amYw'} # except change first Z to q

import requests
for sheet in sheets:
                 url = 'https://docs.google.com/spreadsheets/d/' + sheets[sheet] + '/export?format=xlsx'
                 r = requests.get(url)
                 with open(sheet,'wb') as f:
                         f.write(r.content)
                 print('Downloaded '+sheet)
Downloaded f19_reviewer.xlsx
Downloaded f19_movie.xlsx
Downloaded f19_rating.xlsx
In [1]:
import pandas as pd
In [7]:
reviewer = pd.read_excel('f19_reviewer.xlsx')
rating = pd.read_excel('f19_rating.xlsx')
movie = pd.read_excel('f19_movie.xlsx')
rating.head()
Out[7]:
date rid mid stars
0 2019-10-02 777 100 3
1 2019-10-02 777 101 5
2 2019-10-02 777 102 4
3 2019-10-02 777 103 5
4 2019-10-02 3333 104 1
In [4]:
big = rating.merge(reviewer,on='rid').merge(movie,on='mid')
In [5]:
big.head()
Out[5]:
date rid mid stars firstname lastname year title director
0 2019-10-02 777 100 3 John Ringland 2014.0 Birdman Alejandro G. Iñárritu
1 2019-10-02 1437 100 4 Zoe Herrick 2014.0 Birdman Alejandro G. Iñárritu
2 2019-10-04 1920 100 1 Abhishek Mishra 2014.0 Birdman Alejandro G. Iñárritu
3 2019-10-07 235 100 4 Jacob Almudevar 2014.0 Birdman Alejandro G. Iñárritu
4 2019-10-02 777 101 5 John Ringland 2004.0 The Incredibles Brad Bird
In [6]:
pd.__version__
Out[6]:
'0.25.0'

Which director appears the most in the database?

In [10]:
#Fetch the column director, and see the value counts 
d = movie['director'].value_counts()
d
Out[10]:
Christopher Nolan            4
Steven Spielberg             3
Edgar Wright                 3
Seth Rogan, Evan Goldberg    2
Gavin O'Connor               2
                            ..
Andres Muschietti            1
Gregory Hoblit               1
Lenny Abrahamson             1
Luc Besson                   1
Andrew Stanton               1
Name: director, Length: 102, dtype: int64
In [11]:
d[ d==d.max() ]
Out[11]:
Christopher Nolan    4
Name: director, dtype: int64

Who rated Avatar?

In [13]:
(movie[movie['title']=='Avatar']
 .merge(rating,on='mid')[['rid','stars']]
 .merge(reviewer,on='rid')
)
Out[13]:
rid stars firstname lastname
0 6686 5 Eugen Feng
1 1920 2 Abhishek Mishra
2 505 2 Dorien Verbruggen
3 80085 5 Joseph Ng

What was the most common year in the movie database?

In [12]:
common = movie['year'].value_counts()

common[common==common.max()]
Out[12]:
2019.0    10
2018.0    10
Name: year, dtype: int64

How many movies were reviewed - broken down by decade?

In [13]:
import numpy as np
y = []
for item in big:
    y = np.floor(big['year']/10)
    
In [14]:
mbd = movie.copy()
mbd['decade'] = mbd['year']//10  # .map( lambda x: np.floor(x/10)
mbd.head()
Out[14]:
mid year title director decade
0 100.0 2014.0 Birdman Alejandro G. Iñárritu 201.0
1 101.0 2004.0 The Incredibles Brad Bird 200.0
2 102.0 2009.0 City Hall Shin Woo-chul 200.0
3 103.0 1999.0 Topsy Turvy Mike Leigh 199.0
4 104.0 2018.0 Avengers: EndGame Anthony RUsso 201.0
In [15]:
from collections import Counter
Counter(mbd['decade'])
Out[15]:
Counter({201.0: 56,
         200.0: 38,
         199.0: 15,
         196.0: 2,
         197.0: 3,
         194.0: 1,
         198.0: 3,
         nan: 1,
         nan: 1})
In [18]:
rm = rating[['mid']].merge(movie[['mid','year']],on='mid')   # selecting out the columns we actually need
rm.head()
Out[18]:
mid year
0 100 2014.0
1 100 2014.0
2 100 2014.0
3 100 2014.0
4 101 2004.0
In [23]:
rm['decade'] = rm['year'].map(lambda x: int((x//10)*10))
display(rm.head())
decadecount = rm['decade'].value_counts(ascending=True).to_frame()
decadecount
mid year decade
0 100 2014.0 2010
1 100 2014.0 2010
2 100 2014.0 2010
3 100 2014.0 2010
4 101 2004.0 2000
Out[23]:
decade
1960 1
1940 2
1970 3
1980 3
1990 25
2000 48
2010 95

I think this is incorrect. It is counting how many ratings there are by movie decade.

How about dropping dups from rating.

In [15]:
mry = (rating['mid']
       .drop_duplicates()
       .to_frame()
       .merge(movie[['mid','year']])
      )
mry.head()
Out[15]:
mid year
0 100 2014.0
1 101 2004.0
2 102 2009.0
3 103 1999.0
4 104 2018.0
In [29]:
mry['decade'] = mry['year'].map(lambda x: int((x//10)*10))
display(mry.head())
decadecount = mry['decade'].value_counts(ascending=True).to_frame()
decadecount.rename({'decade':'count'},axis=1)
mid year decade
0 100 2014.0 2010
1 101 2004.0 2000
2 102 2009.0 2000
3 103 1999.0 1990
4 104 2018.0 2010
Out[29]:
count
1940 1
1960 1
1970 2
1980 3
1990 15
2000 34
2010 53
In [30]:
movie[ (movie['year']>=2000) & (movie['year']<=2005) ]
Out[30]:
mid year title director
1 101.0 2004.0 The Incredibles Brad Bird
11 112.0 2004.0 The Terminal Steven Spielberg
13 114.0 2002.0 Catch Me If You Can Steven Spielberg
18 119.0 2005.0 Serenity NaN
35 136.0 2000.0 A one and a two Edward Yang
49 150.0 2004.0 Shaun of the Dead Edgar Wright
51 152.0 2002.0 Hero Zhang Yimou
52 153.0 2003.0 Memories Of Murder Bong Joon-ho
54 155.0 2001.0 A beautiful mind Ron Howard
69 170.0 2004.0 Howl's Moving Castle Hayao Miyazaki
79 180.0 2000.0 Crouching Tiger, Hidden Dragon Ang Lee
89 190.0 2001.0 Black Hawk Down Ridley Scott
90 191.0 2005.0 Jarhead Sam Mendes
95 196.0 2003.0 Johnny English Peter Howitt
99 200.0 2004.0 Troy Wolfgang Petersen
108 213.0 2001.0 Rat Race Andy Breckman
110 215.0 2004.0 The Spongebob Squarepants Movie Stephen Hillenburg
115 220.0 2001.0 Amelie Jean-Pierre Jeunet
116 221.0 2002.0 Chicago Rob Marshall
117 222.0 2004.0 The Chorus Christophe Barratier

Which rated movie had the longest title?

In [31]:
movie.head()
Out[31]:
mid year title director
0 100.0 2014.0 Birdman Alejandro G. Iñárritu
1 101.0 2004.0 The Incredibles Brad Bird
2 102.0 2009.0 City Hall Shin Woo-chul
3 103.0 1999.0 Topsy Turvy Mike Leigh
4 104.0 2018.0 Avengers: EndGame Anthony RUsso
In [32]:
movie['title']=movie['title'].astype('str')
movie['len']=movie['title'].map(len)
index = movie['len'].idxmax()
long_title = movie.loc[index]['title']
display(long_title)
'Along with the Gods: The Two Worlds'
In [40]:
movie[movie['len']==movie['len'].max()]#.iloc[0]
Out[40]:
mid year title director len
84 185.0 2017.0 Along with the Gods: The Two Worlds Yong-hwa Kim 35

Which director has the highest review scores in our list?

In [50]:
rm = rating.merge(movie,on='mid')
g = rm.groupby('director')
directors = g.agg({'stars' : ['mean','count']})
directors.columns = directors.columns.get_level_values(1) + ' ' + directors.columns.get_level_values(0)
directors = directors[directors['mean stars'] == directors['mean stars'].max()]
directors.sort_values('count stars',ascending=False)
# still want to pull out the directors who attain the max
Out[50]:
mean stars count stars
director
Jon Favreau 5.0 2
Ron Howard 5.0 2
Akiro Kurosawa 5.0 1
Mel Gibson 5.0 1
Peter Berg 5.0 1
Peter Farrelly 5.0 1
Rajkumar Santoshi 5.0 1
Robert Schwentke 5.0 1
Roman Polanski 5.0 1
Sam Mendes 5.0 1
Sriram Raghavan 5.0 1
Stephen Hillenburg 5.0 1
Stephen Susco 5.0 1
Tom Hooper 5.0 1
Wolfgang Petersen 5.0 1
Won-Tae Lee 5.0 1
Yeon Sang-ho 5.0 1
Mike Leigh 5.0 1
Matthew Brown 5.0 1
Ang Lee 5.0 1
Martha Coolidge 5.0 1
Luc Besson 5.0 1
Jiang Wen 5.0 1
Jean-Pierre Jeunet 5.0 1
Ivan Reitman 5.0 1
Isao Takahata 5.0 1
Hayao Miyazaki 5.0 1
Gregory Hoblit 5.0 1
Giuseppe Tornatore 5.0 1
Gavin O'Connor 5.0 1
Edward Yang 5.0 1
Christopher McQuarrie 5.0 1
Christophe Barratier 5.0 1
Bong Joon-ho 5.0 1
Zhang Yimou 5.0 1

Who reviewed movies over the longest time span?

rating.head()
In [55]:
mylist = rating.merge(movie, on = 'mid').merge(movie[[ 'year']])
mylist = mylist.iloc[:,:5]
mylist.head()
mylist.columns
Out[55]:
Index(['date', 'rid', 'mid', 'stars', 'year'], dtype='object')
In [57]:
mylist0 = mylist.drop(columns = ['date', 'mid', 'stars'])
mylist0
Out[57]:
rid year
0 777 2014.0
1 777 2014.0
2 777 2014.0
3 777 2014.0
4 777 2014.0
... ... ...
1013 6666666 2011.0
1014 515 2011.0
1015 515 2011.0
1016 515 2011.0
1017 455 1986.0

1018 rows × 2 columns

In [59]:
rating[['rid','mid']].merge(movie[['mid','year']],on='mid')[['rid','year']]
Out[59]:
rid year
0 777 2014.0
1 1437 2014.0
2 1920 2014.0
3 235 2014.0
4 777 2004.0
... ... ...
172 102 2002.0
173 102 2004.0
174 102 2017.0
175 102 2018.0
176 80085 2018.0

177 rows × 2 columns

In [62]:
mylist00 = (mylist0.groupby('rid').agg([('Min', 'min'),('Max', 'max')]))
mylist00.head()
mylist00.columns
Out[62]:
MultiIndex([('year', 'Min'),
            ('year', 'Max')],
           )
In [66]:
mylist00['time span'] = mylist00[('year', 'Max')] - mylist00[('year', 'Min')]
mylist00.head()
Out[66]:
year time span
Min Max
rid
0 1978.0 2019.0 41.0
1 1998.0 2014.0 16.0
5 2002.0 2015.0 13.0
6 1998.0 2019.0 21.0
7 1948.0 2014.0 66.0
In [69]:
mylist00[mylist00['time span'] == mylist00['time span'].max()].merge(reviewer, on = 'rid')
/usr/local/lib/python3.6/dist-packages/pandas/core/reshape/merge.py:617: UserWarning: merging between different levels can give an unintended result (2 levels on the left, 1 on the right)
  warnings.warn(msg, UserWarning)
Out[69]:
rid (year, Min) (year, Max) (time span, ) firstname lastname
0 7 1948.0 2014.0 66.0 Bernard Badzioch
1 235 1948.0 2014.0 66.0 Jacob Almudevar

Added after class

In [78]:
rmm = (rating[['rid','mid']]
 .merge(movie[['mid','year']],on='mid')
 .groupby('rid')
 .agg({'year':['min','max']})
)
rmm.head()
Out[78]:
year
min max
rid
0 1978.0 2019.0
1 1998.0 2014.0
5 2002.0 2015.0
6 1998.0 2019.0
7 1948.0 2014.0
In [81]:
rmm['span'] = rmm[('year','max')] - rmm[('year','min')]
rmm.head()
rmm[ rmm['span']==rmm['span'].max() ].merge(reviewer,on='rid')
Out[81]:
rid (year, min) (year, max) (span, ) firstname lastname
0 7 1948.0 2014.0 66.0 Bernard Badzioch
1 235 1948.0 2014.0 66.0 Jacob Almudevar

More to come ...

What is the most highly rated movie?

Ji Ho Lee.

Modifying the original tables will mess them up for other people's queries.

In [3]:
#rating = rating.groupby('mid').agg({'stars':'mean'}).merge( movie, on = 'mid')
#rating[rating['stars']==rating['stars'].max()][['stars','title']]

rating1 = rating.groupby('mid').agg({'stars':'mean'}).merge( movie, on = 'mid')
rating1[rating1['stars']==rating1['stars'].max()][['stars','title']]
Out[3]:
stars title
3 5.0 Topsy Turvy
6 5.0 Knife in the Water
7 5.0 Dodes'ka-den
12 5.0 Saving Private Ryan
16 5.0 Léon: The Professional
33 5.0 A one and a two
37 5.0 Hacksaw Ridge
41 5.0 Green Book
48 5.0 The World's End
49 5.0 Hero
50 5.0 Memories Of Murder
52 5.0 A beautiful mind
53 5.0 The man who knew infinity
56 5.0 The Lion King
59 5.0 Cloudy with a chance of meatballs
60 5.0 Dave
61 5.0 Real Genius
63 5.0 The Time Traveler's Wife
64 5.0 Grave of the Fireflies
66 5.0 In the Heat of the Sun
67 5.0 Howl's Moving Castle
71 5.0 The King's Speech
73 5.0 Inception
77 5.0 Crouching Tiger, Hidden Dragon
78 5.0 Unfriended: Dark Web
80 5.0 The Legend of 1900
81 5.0 Train to Busan
83 5.0 The Gangster, The Cop, The Devil
84 5.0 NaN
86 5.0 Lone Survivor
87 5.0 Black Hawk Down
88 5.0 Jarhead
92 5.0 Mission Impossible - Fallout
93 5.0 Johnny English
94 5.0 Andaz Apna Apna
95 5.0 Andhadhun
97 5.0 Troy
102 5.0 The Spongebob Squarepants Movie
103 5.0 Warrior
104 5.0 Amelie
106 5.0 The Chorus

The following is NOT approved: do not iterate over rows in python unless absolutely unavoidable.

Eugen:

In [8]:
low = []
data = rating.merge(movie,on='mid').merge(reviewer,on='rid')
data['Name'] = data['firstname'] + ' ' + data['lastname']
#data = data.drop(['director','date','title','year','rid','lastname','firstname'],axis=1)

for i in range(0,len(data)):
    if data['stars'].iloc[i] == 1:
        low.append(data.iloc[i])
low = pd.DataFrame(low)
low['Name'].mode()
Out[8]:
0    Abhishek Mishra
1       Betsy McCall
2       Chima Ezeilo
3            Eyob TM
4      John Ringland
dtype: object

How many people actually provided a review?

Gifari

In [9]:
len(set(reviewer['rid']))
Out[9]:
34

The above is not correct, because it includes everyone in the reviewer table, even if they rated no movies.

Which is the least liked movie?

The Fantastic Three

In [10]:
t = (rating.groupby('mid')
    .agg({'stars':'mean'})
    .sort_values('stars',ascending=True)
    .reset_index()
    .rename(columns={'stars':'average ratings'})
    .merge(movie[['mid','title']],on='mid')
    )
t[ t['average ratings']== t['average ratings'].min() ] 
Out[10]:
mid average ratings title
0 168 2.0 Sherlock Holmes
1 182 2.0 High School Musical 3: Senior Year

Who rated Avatar?

jrong (Also see the way it was done differently above.)

In [12]:
mid = movie[movie['title']=='Avatar']['mid']
rid = rating[rating['mid'] == float(mid)]['rid']
rev = reviewer[reviewer['rid'].isin(rid)]
rev
Out[12]:
rid firstname lastname
20 505 Dorien Verbruggen
22 1920 Abhishek Mishra
25 80085 Joseph Ng
30 6686 Eugen Feng

How are 21st-century movies compare to 20th-century movies on average rating?

Darsh

In [14]:
data = rating.merge(movie,on='mid')[['year','stars']]
# data.head()

rating_20 = data[(data['year']<2000)]
x=rating_20['stars'].mean()
print('The average rating of 20th century movies is {}'.format(round(x,2)))

rating_21 = data[(data['year']>=2000)]
y=rating_21['stars'].mean()
print('The average rating of 21st century movies is {}'.format(round(y,2)))
The average rating of 20th century movies is 4.41
The average rating of 21st century movies is 4.1

Average ratings by year

Modified from Jeremy's

In [27]:
import altair as alt
alt.renderers.enable('notebook')
alt.Chart(rating
 .merge(movie[['mid','year']],on='mid')
 .groupby('year')
 .agg({'stars':'mean'})
 .reset_index(),
          height=100,width=600
).mark_bar().encode(x='year',y='stars')
Out[27]:

In [ ]: