# %load get_rotten.py
"""Scatter-plot the 'Rank' column against Rotten Tomatoes scores by rating.

The script reads the tab-separated file ``IMDB_RT_some.txt``, drops rows with
missing values, splits the films by their ``Rated`` value and draws one
colour-coded scatter series per rating group on a single Bokeh figure with a
hover tooltip.  Output is directed both to the notebook and to
``toolbar.html``.

NOTE(review): the ``Rank`` column is plotted on the axis labelled
'IMDB Rating' -- presumably it holds the IMDB score; confirm against the
input file.
"""
__author__ = 'ju'

import json

import pandas as pd
import numpy as np
import requests
# urllib2 is only referenced by the commented-out scraping code below.
from urllib2 import Request, urlopen, URLError
from bokeh.plotting import (figure, output_file, output_notebook, show,
                            ColumnDataSource)
from bokeh.models import HoverTool, BoxSelectTool
import bokeh

# output_notebook was previously called without being imported.
output_notebook()

# ---------------------------------------------------------------------------
# One-off scraping step that produced 'IMDB_RT_some.txt'.  Kept (disabled)
# as a record of how the input file was built.
# ---------------------------------------------------------------------------
# df = pd.read_csv('IMDB_7000.txt',sep='\t')
# df = df[df['Votes']>=50000]
# print df
#
# #print '%'.join(['The','Shawshank'])
# imdb = []
# rt = []
# rated = []
# year = []
# genre = []
#
# for i in df.index:
#     url = 'http://www.omdbapi.com/?t='+'%25'.join(df.loc[i,'Title'].split(' '))+'&y='+str(int(df.loc[i,'Year']))+'&tomatoes=true'
#     result = json.load(urlopen(url))
#     print df.loc[i,'Title']
#     if 'imdbRating' in result:
#         imdb.append(result['imdbRating'])
#     else:
#         imdb.append(np.nan)
#     if 'tomatoMeter' in result:
#         rt.append(result['tomatoMeter'])
#     else: rt.append(np.nan)
#     if 'Year' in result:
#         year.append(result['Year'])
#     else: year.append(np.nan)
#     if 'Genre' in result:
#         genre.append(result['Genre'])
#     else: genre.append(np.nan)
#     if 'Rated' in result:
#         rated.append(result['Rated'])
#     else: rated.append(np.nan)
# df['Year'] = pd.Series(year,index=df.index)
# df['Genre'] = pd.Series(genre,index=df.index)
# df['Rated'] = pd.Series(rated,index=df.index)
# df['IMDB Rating'] = pd.Series(imdb,index=df.index)
# df['Rotten Tomatoes'] = pd.Series(rt, index=df.index)
#
# df=df.dropna()
# print df
# df.to_csv('IMDB_RT_some.txt',sep='\t',index=False)

# The file was written with a plain tab separator (see to_csv above), so it
# must be read back with exactly '\t'.
df = pd.read_csv('IMDB_RT_some.txt', sep='\t')
#df=df.drop('Unnamed: 0',axis=1)
df = df.dropna()

# Flat list of every genre tag; each 'Genre' cell is a ', '-separated string.
# Built in one pass instead of repeated list concatenation.
allgen = [tag for cell in df['Genre'] for tag in cell.split(', ')]
#print set(allgen)
#print set(df['Rated'])

output_file("toolbar.html")

# Source over the whole table; only used by the disabled all-in-one scatter
# further down, kept so the name stays available.
source = ColumnDataSource(
    data=dict(
        x=df['Rank'].tolist(),
        y=df['Rotten Tomatoes'].tolist(),
        desc=df['Title'].tolist(),
    )
)

# Tooltip fields refer to the 'desc', 'ra' and 'rt' columns that
# _rating_source() puts into every per-rating data source.
hover = HoverTool(
    tooltips=[
        #("index", "$index"),
        ("Title", "@desc"),
        ('IMDB Rating', '@ra'),
        ('Rotten Tomatoes', '@rt'),
        #("(x,y)", "($x, $y)"),
    ]
)


def _rating_source(frame):
    """Build the ColumnDataSource for one rating group.

    ``frame`` is a subset of ``df``.  The returned source carries:

    * ``x`` / ``y`` -- the 'Rank' and 'Rotten Tomatoes' values to plot,
    * ``ra`` / ``rt`` -- the same values as strings for the hover tooltip
      ('Rotten Tomatoes' is truncated to an integer first),
    * ``desc`` -- the film titles.
    """
    ranks = frame['Rank'].tolist()
    tomato = frame['Rotten Tomatoes'].tolist()
    return ColumnDataSource(
        data=dict(
            x=ranks,
            y=tomato,
            ra=[str(value) for value in ranks],
            rt=[str(int(value)) for value in tomato],
            desc=frame['Title'].tolist(),
        )
    )


# One frame + data source per rating category.
r = df[df['Rated'] == 'R']
rsource = _rating_source(r)

pg13 = df[df['Rated'] == 'PG-13']
pg13source = _rating_source(pg13)

pg = df[df['Rated'] == 'PG']
pgsource = _rating_source(pg)

g = df[df['Rated'] == 'G']
gsource = _rating_source(g)

nc17 = df[df['Rated'] == 'NC-17']
nc17source = _rating_source(nc17)

# Every remaining rating label is lumped into a single 'other' series.
other = df[df['Rated'].isin(
    ['GP', 'M', 'NOT RATED', 'PASSED', 'X', 'UNRATED', 'APPROVED'])]
othersource = _rating_source(other)

p = figure(
    title='IMDB vs. RT',
    tools=[hover],
    x_axis_label='IMDB Rating',
    y_axis_label='Rotten Tomatoes Rating',
)
#p.scatter(df['Rank'].tolist(), df['Rotten Tomatoes'], size=5, source=source)

# Series are drawn in this order, which also fixes the legend order.
# x/y are given as column names of the source rather than as separate lists,
# so the glyph and its hover data come from the same source.
# NOTE(review): `legend=` and `legend.orientation` below are the spellings
# this script was written against; newer Bokeh releases use `legend_label=`
# and `legend.location` -- confirm against the installed version.
for series_source, colour, label in [
    (nc17source, 'orange', 'NC-17'),
    (rsource, 'red', 'R'),
    (pg13source, 'blue', 'PG-13'),
    (pgsource, 'green', 'PG'),
    (gsource, '#32cd32', 'G'),
    (othersource, 'grey', 'other'),
]:
    p.scatter('x', 'y', size=5, source=series_source, color=colour,
              legend=label)

p.legend.orientation = 'bottom_right'

show(p)