Aim: examine the trend in US viewership of the show Marvel's Agents of S.H.I.E.L.D.
- Goal: look for patterns in US viewership across the first three seasons.
- All data is taken from Wikipedia.
- TODO: improve the plots using matplotlib.
In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
style.use('fivethirtyeight')
%matplotlib inline
import itertools
In [2]:
# defs
def rep(str, times):
    """Return a list holding ``str`` repeated ``times`` times.

    Note: the first parameter shadows the built-in ``str`` inside this
    function only; the name is kept so existing callers are unaffected.
    """
    return list(itertools.repeat(str, times))
def episodize(df, season):
    """df, str -> list

    Build one "<season>E<n>" label per row of ``df``, numbered from 1.

    Only ``len(df)`` is used, so any sized object works. ``season`` must be
    a string (it is concatenated with ``'E'``), e.g. ``'S1'``.
    """
    # Count rows directly instead of materialising a repeated list just to
    # enumerate it.
    return [season + 'E' + str(number) for number in range(1, len(df) + 1)]
In [3]:
# Wikipedia page that the episode tables are read from.
url = "https://en.wikipedia.org/wiki/List_of_Agents_of_S.H.I.E.L.D._episodes"
In [4]:
# Fetch and parse the page (network access required). The result is indexed
# by position further down, one entry per table found on the page.
wiki = pd.read_html(url)
In [5]:
# Only the season 1, 2 and 3 tables are needed; they are the entries at
# positions 1, 2 and 3 of the list parsed from the Wikipedia page.
s1, s2, s3 = wiki[1], wiki[2], wiki[3]
#s3.head(3)
In [6]:
# cleaning up the data frame
def season_df(df, season):
    """Clean one raw season table and index it by episode label.

    Parameters
    ----------
    df : DataFrame
        Raw season table with exactly seven columns. Its first row is
        discarded (presumably the header row repeated as data - confirm
        against the ``read_html`` output). The caller's frame is not modified.
    season : str
        Season prefix passed to ``episodize``, e.g. ``'S1'``.

    Returns
    -------
    DataFrame
        Indexed by ``"Episodes"`` (season:episode ids) with the columns
        ``title``, ``us_viewers_in_millions``, ``directors`` and ``air_date``.
        ``us_viewers_in_millions`` is still text; convert it with
        ``astype(float)`` where numbers are needed.

    Raises
    ------
    ValueError
        If ``df`` does not have exactly seven columns.
    """
    colnames = ["no_over_all", "no_in_season", "title", "directors", "writers",
                "air_date", "us_viewers_in_millions"]

    # Slice out the first row. ``.ix`` was removed in pandas 1.0, and the
    # explicit copy keeps later changes away from the caller's frame.
    frame = df.iloc[1:].copy()
    if frame.shape[1] != len(colnames):
        raise ValueError(
            "expected %d columns, got %d" % (len(colnames), frame.shape[1])
        )
    frame.columns = colnames

    # Remove every row that contains a NaN in any column.
    frame = frame.dropna()

    # Keep only the text inside the first pair of parentheses of the air
    # date; if there is no parenthetical, keep the value unchanged instead of
    # failing with IndexError.
    air_dates = frame["air_date"].astype(str)
    air_inner = air_dates.str.extract(r"\(([^()]*)", expand=False)

    # Drop everything from the first "[" onwards (e.g. a trailing "[12]").
    viewers = frame["us_viewers_in_millions"].astype(str).str.split("[", n=1).str[0]

    # Label each row "<season>E<n>" and use those labels as the index.
    frame = frame.assign(
        air_date=air_inner.fillna(air_dates),
        us_viewers_in_millions=viewers,
        Episodes=episodize(frame, season),
    ).set_index("Episodes")

    # Keep only the columns of interest, in display order.
    return frame[["title", "us_viewers_in_millions", "directors", "air_date"]]
In [7]:
#season_df(s3,'S3')
In [8]:
#del dfs1, dfs2, dfs3
# Build one cleaned data frame per season.
dfs1, dfs2, dfs3 = (
    season_df(table, label)
    for table, label in ((s1, 'S1'), (s2, 'S2'), (s3, 'S3'))
)
In [9]:
# Preview the first rows of the cleaned season 1 frame.
dfs1.head()
Out[9]:
In [10]:
# Season 1: US viewers (in millions) per episode.
p = dfs1['us_viewers_in_millions'].astype(float)
ax = p.plot()
# Title and axis labels so the figure can be read on its own.
ax.set(title='Agents of S.H.I.E.L.D. season 1',
       xlabel='Episode', ylabel='US viewers (millions)')
plt.show()
In [11]:
# Season 2: US viewers (in millions) per episode.
p = dfs2['us_viewers_in_millions'].astype(float)
ax = p.plot()
# Title and axis labels so the figure can be read on its own.
ax.set(title='Agents of S.H.I.E.L.D. season 2',
       xlabel='Episode', ylabel='US viewers (millions)')
plt.show()
In [12]:
# Season 3: rows whose rating is the placeholder 'TBD' cannot be converted
# to float, so they are left out before plotting.
p = dfs3.loc[dfs3['us_viewers_in_millions'] != 'TBD',
             'us_viewers_in_millions'].astype(float)
ax = p.plot()
# Title and axis labels so the figure can be read on its own.
ax.set(title='Agents of S.H.I.E.L.D. season 3',
       xlabel='Episode', ylabel='US viewers (millions)')
plt.show()