Project - MovieLens Data Analysis¶The GroupLens Research Project is a research group in the Department of Computer Science and Engineering at the University of Minnesota. The data is widely used for collaborative filtering and other filtering solutions. However, we will be using this data to act as a means to demonstrate our skill in using Python to “play” with data.
Objective:¶Learning Outcomes:¶Exploratory Data Analysis
Visualization using Python
Pandas – groupby, merging
Domain¶Note that the project will need you to apply the concepts of groupby and merging extensively.
Datasets Information:¶rating.csv: It contains information on ratings given by the users to a particular movie.
movie.csv: File contains information related to the movies and their genre.
user.csv: It contains information of the users who have rated the movies.
Please provide you insights wherever necessary.
#Importing Numpy, Pandas and Seaborn
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as mplt
#Reading the CSV files into Jupyter Notebook.
user = pd.read_csv('user.csv')
item = pd.read_csv('item.csv')
data = pd.read_csv('Data.csv')
#Finding a basic understanding of the datasets: its contents, shape, descriptive statistics, null values, etc.
print('\033[1m' + 'User:')
print('\033[0m')
print(user.info) #User dataframe
print(user.shape) #User dataframe number of rows and columns
print(user.describe()) #Descriptive Statistics for user
print(user.isnull().sum().sum()) #Number of missing observations for user
print('')
print('')
print('Item') #Item dataframe.
print(item.info)
print(item.shape)
print(item.describe())
print(item.isnull().sum().sum())
print('')
print('')
print('Data') #Data dataframe
print(data.info)
print(data.shape)
print(data.describe())
print(data.isnull().sum().sum())
item.sum() #Number of movies by genre
#Find row where unknown does not equal to 0.
print(item[(item['unknown']!= 0)])
#Copy original dataframe. Drop Row Index# 1371 from new dataframe
item2 = item.copy()
item2 = item2.drop(item.index[1371]) #Delete row with index number 1371 from new dataframe
item2.iloc[1371] #Row is deleted from new original frame because the movie id is 1374 instead of 1373
hint: use sum on the axis = 1
Display movie name, number of genres for the movie in dataframe
and also print(total number of movies which have more than one genres)
#Create new column('genre_total')that adds up genre columns, which are the last 19 columns
item2['genre_total']= item2.iloc[:, -19:-1].sum(axis=1)
print(item2.head()) #Check that we added all the genres properly
#List of movie names, number of genres
item3 = item2[['movie title', 'genre_total']]
print(item3)
#Subset dataframe and create new dataframe (item_multiple_genres) where genre_total > 1.
item_multiple_genres = item3[item3['genre_total'] > 1] # Creating new dataframe based off of condition genre_total > 1
item_multiple_genres.head()
item_multiple_genres['genre_total'].value_counts().sum() # Count the number of movies that have more than 1 genre and calculate the sum.
HINT: Use distplot for age. Use lineplot or countplot for release year.
HINT: Plot percentages in y-axis and categories in x-axis for ratings, gender and occupation
HINT: Please refer to the below snippet to understand how to get to release year from release date. You can use str.split() as depicted below or you could convert it to pandas datetime format and extract year (.dt.year)
a = 'My*cat*is*brown'
print(a.split('*')[3])
#similarly, the release year needs to be taken out from release date
#also you can simply slice existing string to get the desired data, if we want to take out the colour of the cat
print(a[10:])
print(a[-5:])
#AGE
print('Age') #AGE
sns.distplot(user['age']);
#RELEASE YEAR
print('Release Year')
item['release_year_string'] = pd.Series(item['release date'], dtype="string") #Change release date from Series object to string
print(item.head())
print(item.info()) #Check to see if feature is a string
def releaseyr(release_year_string):
return release_year_string.split('-')[2] #Because series object doesn't have split, you have to use apply()
item['release_year'] = item['release_year_string'].apply(releaseyr) #Get year from function
print(item.head())
print(item.tail())
mplt.figure(figsize=(15,10)) #Countplot for Release Year
chart = sns.countplot(item['release_year'])
chart.set_xticklabels(chart.get_xticklabels(), rotation=90)
#RATING
rating = pd.DataFrame(data['rating'].value_counts())
total_ratings = data['rating'].value_counts().sum() #Find frequency of ratings and make new dataframe
rating.head(6)
rating['percent'] = rating['rating']/total_ratings*100 #Calculating percentage of ratings
rating['percent'] = round(rating['percent'], 2)
rating.head(6)
rating.reset_index(inplace=True)
rating.head(6)
rating.columns
rating.columns = ['Rating', 'Frequency', 'Percentage (%)'] #Naming table columns
print(rating.head())
## Making lollipop chart in Matplotlib to show percentage of ratings.
(markerline, stemlines, baseline) = mplt.stem(rating['Rating'],
rating['Percentage (%)'])
mplt.setp(markerline, marker='.', markersize=15,
markeredgewidth=2, color='red')
mplt.setp(stemlines, color='red')
mplt.setp(baseline, visible=False)
mplt.tick_params(labelsize=12)
mplt.xlabel('Rating', size=12)
mplt.ylabel('Percentage (%)', size=12)
mplt.xticks([1, 2, 3, 4, 5])
mplt.ylim(bottom=0)
mplt.show()
#GENDER
gender = pd.DataFrame(user['gender'].value_counts())
total = user['gender'].value_counts().sum()
gender['percent'] = gender['gender']/total*100
gender['percent'] = round(gender['percent'], 2)
gender.head()
gender.reset_index(inplace=True)
gender.head(6)
gender.columns
gender.columns = ['Gender', 'Frequency', 'Percentage (%)']
print(gender.head())
(markerline, stemlines, baseline) = mplt.stem(gender['Gender'],
gender['Percentage (%)'])
mplt.setp(markerline, marker='x', markersize=15,
markeredgewidth=2, color='blue')
mplt.setp(stemlines, color='blue')
mplt.setp(baseline, visible=False)
mplt.tick_params(labelsize=12)
mplt.xlabel('Gender', size=12)
mplt.ylabel('Percentage (%)', size=12)
mplt.ylim(bottom=0)
mplt.show()
##OCCUPATION
occupation = pd.DataFrame(user['occupation'].value_counts())
total_occup = user['occupation'].value_counts().sum()
occupation['percent'] = occupation['occupation']/total_occup*100
occupation['percent'] = round(occupation['percent'], 2)
occupation.head()
occupation.reset_index(inplace=True)
occupation.head(6)
occupation.columns
occupation.columns = ['Occupation', 'Frequency', 'Percentage (%)']
print(occupation)
(markerline, stemlines, baseline) = mplt.stem(occupation['Occupation'],
occupation['Percentage (%)'])
mplt.setp(markerline, marker='*', markersize=15,
markeredgewidth=2, color='green')
mplt.setp(stemlines, color='green')
mplt.setp(baseline, visible=False)
mplt.tick_params(labelsize=12)
mplt.xlabel('Occupation', size=12)
mplt.ylabel('Percentage (%)', size=12)
mplt.xticks(rotation=90)
mplt.ylim(bottom=0)
mplt.show()
Note that you need to use the percent of number of releases in a year as a parameter of popularity of a genre
Hint 1: You need to reach to a data frame where the release year is the index and the genre is the column names (one cell shows the number of release in a year in one genre) or vice versa. (Drop unnecessary column if there are any)
Hint 2: Find the total number of movies release in a year(use sum(axis=1) store that value in a new column as 'total'). Now divide the value of each genre in that year by total to get percentage number of release in a particular year.
(df.div(df['total'], axis= 0) * 100)
Once that is achieved, you can either use univariate plots or can use the heatmap to visualise all the changes over the years in one go.
Hint 3: Use groupby on the relevant column and use sum() on the same to find out the number of releases in a year/genre.
item.head()
item_year_action = item.groupby(['release_year'])[['unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']].sum().reset_index()
item_year_action.tail() #Using groupby function, and sum()
item_year_action['total'] = item_year_action.sum(axis=1) #Sum the genre columns and place result in 'total' column
item_year_action.tail()
item_year_action2 = item_year_action.set_index('release_year') #Make release_year the index
item_year_action2.info
item_year_action2.dtypes
movie_release_per_year_percent = item_year_action2.iloc[:,1:19].div(item_year_action2.total, axis=0)*100 #Get percentage of each genre in each year
print(movie_release_per_year_percent)
fig, ax = mplt.subplots(figsize=(15,10)) #Setting dimensions of heatmap
sns.heatmap(movie_release_per_year_percent, cmap = "cividis"); #Change color of heatmap
Hints :
Note: This question will need you to research about groupby and apply your findings. You can find more on groupby on https://realpython.com/pandas-groupby/.
data.head()
item.head()
movie_rating = pd.merge(item,data,how='left',on='movie id') #Left merge to retain the movie list from the item dataset
movie_rating.head()
movie_rating2 = movie_rating.groupby(['movie title'])[['rating']].mean().reset_index() #Average Ratings
movie_rating2.tail()
movie_rating.head()
movie_rating3 = movie_rating.groupby(['movie title'])[['rating']].sum().reset_index() #Count of Ratings
movie_rating3.tail()
movie_rating_over_100 = movie_rating3[(movie_rating3['rating'] > 100)] #Subsetting movies with more than 100 ratings
movie_rating_over_100.tail()
movie_rating_over_100_avg_rating = pd.merge(movie_rating_over_100, movie_rating2, how = 'left', on='movie title') # Making frequency table to show frequency and average rating
movie_rating_over_100_avg_rating.head()
movie_rating_over_100_avg_rating.columns = ['movie title', 'Rating Count', 'Rating Average'] #Change the column titles.
print(movie_rating_over_100_avg_rating.sort_values(['Rating Average'], #Display table
ascending=False).head(25))
movie_rating_over_100_avg_rating.sort_values(['Rating Average'], #Creating bar chart of top 25 films with oveer 100 ratings
ascending=False).head(25).plot(x='movie title',y='Rating Average',kind='bar', figsize=(15,5))
mplt.show()
compare the percentages
Merge all the datasets
There is no need to conduct statistical tests around this. Just compare the percentages and comment on the validity of the above statements.
you might want ot use the .sum(), .div() function here.
Use number of ratings to validate the numbers. For example, if out of 4000 ratings received by women, 3000 are for drama, we will assume that 75% of the women watch drama.
#Merge all datasets
movie = pd.merge(item, data, how = 'outer', on = 'movie id') #Merge item and data datasets
movie_user = pd.merge(movie, user, how = 'outer', on = 'user id') #Merge previous merged dataset with user dataset
movie_user.head()
#Verifying if men watch more drama than women.
gender_drama = movie_user.groupby(by=['gender'])['Drama'].sum().reset_index().sort_values(['Drama']).tail(10)
gender_total_drama = gender_drama['Drama'].sum()
gender_drama['Percent'] = round(gender_drama['Drama']/gender_total_drama * 100, 2)
print('Question 2 : Percentage of males and females who watch drama.')
print(gender_drama)
#Number of Rankings by Gender
rating_no_gender = movie_user.groupby(by=['gender'])['rating'].sum().reset_index().sort_values(['rating']).tail(10)
print('Number of rankings by Gender')
print(rating_no_gender)
#Checking to see the percentage of drama viewers amongst male and female raters
gender_drama['Percent_rankings'] = round(gender_drama['Drama']/rating_no_gender['rating'] * 100, 2) #Getting the percentage of each gender who watch drama from all the ratings from that gender
print('Question 4: Percentage of drama viewers amongs male and female rating entries')
print(gender_drama)
#Verifying if women watch more sci-fi than men
gender_scifi = movie_user.groupby(by=['gender'])['Sci-Fi'].sum().reset_index().sort_values(['Sci-Fi']).tail(10)
gender_total_scifi = gender_scifi['Sci-Fi'].sum()
gender_scifi['Percent'] = round(gender_scifi['Sci-Fi']/gender_total_scifi * 100, 2)
print('Question 2 : Percentage of males and females who watch sci-fi.')
print(gender_scifi)
#Checking to see the percentage of sci-fi viewers amongst male and female raters
gender_scifi['Percent_rankings'] = round(gender_scifi['Sci-Fi']/rating_no_gender['rating'] * 100, 2) #Getting the percentage of each gender who watch sci-fi from all the ratings from that gender
print('Question 4: Percentage of sci-fi viewers amongs male and female rating entries')
print(gender_scifi)
#Verifying if men watch more romance than women
gender_romance = movie_user.groupby(by=['gender'])['Romance'].sum().reset_index().sort_values(['Romance']).tail(10)
gender_total_romance = gender_romance['Romance'].sum()
gender_romance['Percent'] = round(gender_romance['Romance']/gender_total_romance * 100, 2)
print('Question 2 : Percentage of males and females who watch romance.')
print(gender_romance)
#Checking to see the percentage of romance viewers amongst male and female raters
gender_romance['Percent_rankings'] = round(gender_romance['Romance']/rating_no_gender['rating'] * 100, 2) #Getting the percentage of each gender who watch romance from all the ratings from that gender
print('Question 4: Percentage of romance viewers amongs male and female rating entries')
print(gender_romance)