#Python/Pandas script to analyze baby names database 1880-2012 #from the U.S. Social Security Administration # #By David Taylor Feb. 2014 # #www.prooffreader.com (for blogged results) # #prooffreaderplus.blogspot.com (for scripts, calculations, links, gits, etc.) # #Disclaimer: not a professional programmer, more interested right now in results in a reasonable time. # That said, constructive critique and suggestions are always totally welcome. I'm not proud. # In particular, there is a lot of very needless duplication of boy and girl databases and code # in loops that refer to them, when I could just subset a larger database every time, # but I have enough memory in my computer and not enough in my head so I just found it easier # to work in this inefficient fashion. Your mileage may vary. # Note that I usually use column names instead of indexes, my brain just deals with them better # right now, as I get more used to pandas I'm already starting to adapt. # Also, sometimes I just print-dump everything to a csv file and work with it in Excel. Sorry! #Instructions: #1. Download data set at (as of Feb. 2014) http://www.ssa.gov/OACT/babynames/names.zip #2. Unzip into a working directory #3. Change the working directory strings in this script. import pandas as pd import numpy as np import matplotlib.pyplot as plt import re as re import scipy import os os . chdir ( "C:/_Dropbox/Dropbox/py/babynames/yobs" ) #change this to your working directory #read yob files, arrange into data frames and concatenate into one data frame #this portion taken from O'Reilly's Python for Data Analysis (2009) years = range ( 1880 , 2013 ) #remember in python a 2013 upper bound means the last value used will be 2012 pieces = [] yobcolumns = [ 'name' , 'sex' , 'births' ] for year in years : path = 'yob %d .txt' % year frame = pd . read_csv ( path , names = yobcolumns ) frame [ 'year' ] = year pieces . append ( frame ) df = pd . concat ( pieces , ignore_index = True ) os . chdir ( "C:/_Dropbox/Dropbox/py/babynames/" ) #change this to your working directory #add column 'pct' that is the number of births of that name and sex in that year #divided by the total number of births of that sex in that year, multiplied by #100 to turn into a percentage and reduce all those leading zeroes def add_pct ( group ): births = group . births . astype ( float ) group [ 'pct' ] = ( births / births . sum () * 100 ) return group df = df . groupby ([ 'year' , 'sex' ]) . apply ( add_pct ) #add rank of each name each year each sex df [ 'ranked' ] = df . groupby ([ 'year' , 'sex' ])[ 'births' ] . rank ( ascending = False ) #subset girls and boys dff = df [ df . sex == 'F' ] dfm = df [ df . sex == 'M' ] #create names dataframe. This DF discards individual birth or pct values, and instead collects data on unique names. #There is one row per unique combination of name and sex. temp_count = pd . DataFrame ( data = dff [ 'name' ] . value_counts (), columns = [ 'year_count' ]) temp_min = pd . DataFrame ( data = dff . groupby ( 'name' ) . year . min (), columns = [ 'year_min' ]) temp_max = pd . DataFrame ( data = dff . groupby ( 'name' ) . year . max (), columns = [ 'year_max' ]) temp_pctsum = pd . DataFrame ( data = dff . groupby ( 'name' ) . pct . sum (), columns = [ 'pct_sum' ]) temp_pctmax = pd . DataFrame ( data = dff . groupby ( 'name' ) . pct . max (), columns = [ 'pct_max' ]) temp_f = temp_count . join ( temp_min ) temp_f = temp_f . join ( temp_max ) temp_f = temp_f . join ( temp_pctsum ) temp_f = temp_f . join ( temp_pctmax ) temp_f [ 'sex' ] = "F" temp_f . reset_index ( inplace = True , drop = False ) temp_f . columns = [ 'name' , 'year_count' , 'year_min' , 'year_max' , 'pct_sum' , 'pct_max' , 'sex' ] temp_f = temp_f [[ 'name' , 'sex' , 'year_count' , 'year_min' , 'year_max' , 'pct_sum' , 'pct_max' ]] temp_count = pd . DataFrame ( data = dfm [ 'name' ] . value_counts (), columns = [ 'year_count' ]) temp_min = pd . DataFrame ( data = dfm . groupby ( 'name' ) . year . min (), columns = [ 'year_min' ]) temp_max = pd . DataFrame ( data = dfm . groupby ( 'name' ) . year . max (), columns = [ 'year_max' ]) temp_pctsum = pd . DataFrame ( data = dfm . groupby ( 'name' ) . pct . sum (), columns = [ 'pct_sum' ]) temp_pctmax = pd . DataFrame ( data = dfm . groupby ( 'name' ) . pct . max (), columns = [ 'pct_max' ]) temp_m = temp_count . join ( temp_min ) temp_m = temp_m . join ( temp_max ) temp_m = temp_m . join ( temp_pctsum ) temp_m = temp_m . join ( temp_pctmax ) temp_m [ 'sex' ] = "M" temp_m . reset_index ( inplace = True , drop = False ) temp_m . columns = [ 'name' , 'year_count' , 'year_min' , 'year_max' , 'pct_sum' , 'pct_max' , 'sex' ] temp_m = temp_m [[ 'name' , 'sex' , 'year_count' , 'year_min' , 'year_max' , 'pct_sum' , 'pct_max' ]] names = pd . concat ([ temp_f , temp_m ], ignore_index = True ) # create years dataframe. This DF discards individual name data, aggregating by year. total = pd . DataFrame ( df . pivot_table ( 'births' , rows = 'year' , cols = 'sex' , aggfunc = sum )) total . reset_index ( drop = False , inplace = True ) total . columns = [ 'year' , 'births_f' , 'births_m' ] total [ 'births_t' ] = total . births_f + total . births_m newnames = pd . DataFrame ( data = names . groupby ( 'year_min' ) . year_min . count (), columns = [ 'firstyearcount' ]) newnames . reset_index ( drop = False , inplace = True ) newnames . columns = [ 'year' , 'new_names' ] uniquenames = pd . DataFrame ( columns = [ 'year' , 'unique_names' ]) for yr in range ( 1880 , 2013 ): uniquenames = uniquenames . append ( pd . DataFrame ([{ 'year' : yr , 'unique_names' : len ( unique ( df [ df . year == yr ] . name ))}]), ignore_index = True ) years = pd . merge ( left = total , right = newnames , on = 'year' , right_index = False , left_index = False ) years = pd . merge ( left = years , right = uniquenames , on = 'year' , right_index = False , left_index = False ) #births dataframes, just the number of births per year births = df . pivot_table ( 'births' , rows = 'year' , cols = 'sex' , aggfunc = sum ) births_f = dff . pivot_table ( 'births' , rows = 'year' , cols = 'sex' , aggfunc = sum ) births_m = dfm . pivot_table ( 'births' , rows = 'year' , cols = 'sex' , aggfunc = sum ) # It takes my $400 Acer desktop computer about 15 seconds for this cell. If you want to compare, put the following, uncommented, # as the first line: # %%timeit