In [1]:
# Scratch cell: a trivial expression (presumably a quick check that the kernel responds).
2+2
Out[1]:
4
In [1]:
from glob import glob

# Collect the per-year data files (names2019/yob*.txt) in ascending
# filename order, then preview the first three.
files = glob('names2019/yob*.txt')
files.sort()
files[:3]
Out[1]:
['names2019/yob1880.txt', 'names2019/yob1881.txt', 'names2019/yob1882.txt']
In [2]:
# Dry run: print the first record of each of the first three files to check
# the "name,gender,count" layout before parsing everything.
d = {}
firstyear, lastyear = 1880, 2018
nyears = lastyear - firstyear + 1  # inclusive span of years
print(nyears)
for file in files[:3]:
    year = int(file[-8:-4])  # the four characters in front of ".txt"
    with open(file) as f:
        lines = f.read().split('\n')
    # str.split always yields at least one piece, so lines[0] always exists.
    line = lines[0]
    name, gender, count = line.split(',')
    print(name, gender, count)
139
Mary F 7065
Mary F 6919
Mary F 8148
In [3]:
import numpy as np

# Build d: name -> {'F': counts, 'M': counts}. Each counts array has one slot
# per year from firstyear to lastyear and stays 0 where no record was read.
d = {}
firstyear = 1880
lastyear  = 2018
nyears = lastyear - firstyear + 1
print(nyears)
for file in files:
    year = int( file[-8:-4] )  # the four characters in front of ".txt"
    if not firstyear <= year <= lastyear:
        # A negative offset would silently wrap around and overwrite the slot
        # of a late year; an offset past the end would raise a bare IndexError.
        raise ValueError('{}: year {} is outside {}-{}'.format(
            file, year, firstyear, lastyear))
    with open(file) as f:
        for raw in f:  # stream the file rather than loading it whole
            line = raw.rstrip('\n')
            if len(line) <= 2:  # exclude blank lines
                continue
            name, gender, count = line.split(',')
            if name not in d:
                # first record for this name: start both genders at zero
                d[name] = {'F': np.zeros(nyears, dtype=int),
                           'M': np.zeros(nyears, dtype=int)}
            d[name][gender][year - firstyear] = int(count)  # count is a string
139
In [5]:
# Inspect one entry of d: a dict holding an 'F' and an 'M' count array,
# each with one slot per year from firstyear to lastyear.
d['Zoe']
Out[5]:
{'F': array([  23,   22,   25,   23,   31,   27,   25,   34,   42,   29,   42,
          34,   34,   23,   28,   34,   36,   35,   30,   27,   26,   26,
          34,   19,   27,   24,   19,   19,   23,   22,   34,   30,   37,
          28,   37,   57,   65,   57,   68,   53,   80,   90,   64,   61,
          74,   73,   58,   57,   64,   73,   54,   77,   86,   79,   79,
          90,   78,   72,   65,   55,   54,   60,   53,   40,   48,   45,
          66,   54,   68,   81,   67,   96,  100,   96,  119,  119,   92,
         143,  151,  126,  119,  123,   97,   81,   80,   78,  118,   80,
          86,   91,  132,  122,   96,  121,  105,  125,   97,  122,   94,
         108,  103,  110,  130,  174,  202,  194,  213,  246,  242,  376,
         479,  722,  981, 1192, 1334, 1726, 2065, 2362, 2692, 3237, 3785,
        4644, 4886, 5085, 5363, 4962, 5153, 4934, 4781, 5146, 6269, 6305,
        6457, 5971, 5877, 6041, 5743, 5158, 5062]),
 'M': array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  7,  0,  0,  6,  0,  0,  0,  5,  0,  0,  5,  0,
         9,  0,  0,  0,  5,  0,  5, 10,  9, 14, 16, 16,  8, 20,  7, 14, 10,
        14, 10, 16, 16, 15, 26, 17, 15, 11, 15, 12, 19, 14, 17, 15, 21, 15,
        15, 11, 14])}

list comprehensions

are great! They build a new list from an existing list (or from any other iterable, such as a string).

In [6]:
# Triple every character of the string and collect the results in a list.
l = 'abcd'
[ch * 3 for ch in l]
Out[6]:
['aaa', 'bbb', 'ccc', 'ddd']
In [7]:
# Triple every character except 'b': the trailing `if` clause filters the input.
l = 'abcd'
[ch * 3 for ch in l if ch != 'b']
Out[7]:
['aaa', 'ccc', 'ddd']
In [7]:
import matplotlib.pyplot as plt

# Raw yearly counts for one name: 'F' drawn in skyblue, 'M' in pink.
name = 'John'
years = range(firstyear, lastyear + 1)
for gender, colour in (('F', 'skyblue'), ('M', 'pink')):
    plt.plot(years, d[name][gender], color=colour)

Quiz

True or false? "John" had a huge surge in popularity around the times of the First and Second World Wars.

Not necessarily: Popularity means high frequency compared to other names.

Compute the total in each gender for all the years:

In [8]:
# Per-year totals across all names, one array per gender.
ftotal = np.zeros(nyears, dtype=int)
mtotal = np.zeros(nyears, dtype=int)
# Iterate over the values rather than `for name in d`: a loop variable called
# `name` would overwrite the notebook-level `name` that the plotting cells read.
for counts in d.values():
    ftotal += counts['F']
    mtotal += counts['M']
In [9]:
# Fraction of each gender's yearly total accounted for by the name.
name = 'John'
years = range(firstyear, lastyear + 1)
f_share = d[name]['F'] / ftotal
m_share = d[name]['M'] / mtotal
plt.plot(years, f_share, color='skyblue')
plt.plot(years, m_share, color='pink');

So, no, "John" experienced an almost unbroken decline in popularity from 1880 to 2018.

In [13]:
# Total count per year for each gender ('F' in skyblue, 'M' in pink).
years = range(firstyear, lastyear + 1)
for total, colour in zip((ftotal, mtotal), ('skyblue', 'pink')):
    plt.plot(years, total, color=colour)

Gender specificity

In [10]:
# Yearly counts for one name, labelled by gender so the legend can tell the
# two lines apart.
name = 'Leslie'
years = range(firstyear, lastyear + 1)
for gender, colour in (('F', 'skyblue'), ('M', 'pink')):
    plt.plot(years, d[name][gender], color=colour, label=gender)
plt.legend()
Out[10]:
<matplotlib.legend.Legend at 0x7f2f865dcc50>

Plot female:male ratio

In [11]:
# F/M ratio per year for `name` (set in an earlier cell).
# A year with a zero 'M' count divides by zero and yields inf (or nan for
# 0/0); np.errstate silences the resulting RuntimeWarnings for this
# computation only.
with np.errstate(divide='ignore', invalid='ignore'):
    ratio = d[name]['F'] / d[name]['M']
plt.plot(range(firstyear, lastyear + 1), ratio, color='green')
plt.ylabel('F/M')
Out[11]:
Text(0,0.5,'F/M')

Let's superimpose plots for many names

In [16]:
# Overlay the F/M ratio curves of the leading names in d (the loop stops once
# i exceeds 1000); the low alpha makes regions where curves pile up darker.
# Zero counts make the division yield inf/nan. np.errstate suppresses those
# warnings for this loop only, instead of changing NumPy's error handling for
# the whole kernel.
years = range(firstyear, lastyear + 1)
with np.errstate(divide='ignore', invalid='ignore'):
    for i, name in enumerate(d.keys()):
        plt.plot(years,
                 d[name]['F'] / d[name]['M'],
                 color='green', alpha=0.03)
        if i > 1000: break
plt.ylabel('F/M')
Out[16]:
Text(0,0.5,'F/M')

A linear scale can distinguish big from small, but it cannot distinguish small from very small: both end up squashed against zero.

In [50]:
# Three salary figures spanning several orders of magnitude.
# The first is computed as 11.10 * 40 * 50 (presumably hourly wage x hours
# per week x weeks per year -- confirm).
salaries = dict([
    ('minimum_wage_worker', 11.10 * 40 * 50),
    ('Candace Johnson', 1.2e6),
    ('Elon Musk', 513e6),
])
In [51]:
import altair as alt
# Select Altair's 'notebook' renderer for chart display.
# NOTE(review): presumably targets the classic Jupyter Notebook front end -- confirm for the environment in use.
alt.renderers.enable('notebook')
Out[51]:
RendererRegistry.enable('notebook')
In [52]:
import pandas as pd
In [53]:
# One row per entry of salaries: the key goes in 'who', the amount in '$'.
records = list(salaries.items())
sdf = pd.DataFrame(records, columns=['who', '$'])
sdf
Out[53]:
who $
0 minimum_wage_worker 22200.0
1 Candace Johnson 1200000.0
2 Elon Musk 513000000.0
In [54]:
# One point per row of sdf, with '$' on a default (linear) y-axis.
alt.Chart(sdf).mark_point().encode(
    x='who',
    y='$',
)
/usr/local/lib/python3.6/dist-packages/altair/utils/core.py:294: FutureWarning: A future version of pandas will default to `skipna=True`. To silence this warning, pass `skipna=True|False` explicitly.
  attrs['type'] = infer_vegalite_type(data[attrs['field']])
Out[54]:

In [55]:
# Rebuild the table and append the base-10 logarithm of each amount.
sdf = (
    pd.DataFrame(salaries.items(), columns=['who', '$'])
    .assign(**{'log10 $': lambda frame: frame['$'].map(np.log10)})
)
sdf
Out[55]:
who $ log10 $
0 minimum_wage_worker 22200.0 4.346353
1 Candace Johnson 1200000.0 6.079181
2 Elon Musk 513000000.0 8.710117
In [56]:
# Plot the precomputed 'log10 $' column instead of the raw amounts.
alt.Chart(sdf).mark_point().encode(
    x='who',
    y='log10 $',
)
Out[56]:

In [59]:
# Same 'log10 $' column, with the y scale configured as zero=False.
y_axis = alt.Y('log10 $', scale=alt.Scale(zero=False))
alt.Chart(sdf).mark_point().encode(x='who', y=y_axis)
Out[59]:

In [60]:
# Plot the raw '$' amounts and make the y scale itself logarithmic.
log_scale = alt.Scale(type='log', zero=False)
alt.Chart(sdf).mark_point().encode(x='who', y=alt.Y('$', scale=log_scale))
Out[60]:

In [ ]:
Apply logarithmic scaling to the F/M ratio of names:
In [16]:
# Report how many distinct names were loaded into d.
print('{} names altogether'.format(len(d)))
98400 names altogether
In [18]:
# log10(F/M) over time for a shuffled selection of names (the loop stops once
# i exceeds 30000).
# The shuffle uses a fixed seed so that re-running the notebook draws the same
# figure. np.errstate silences the divide-by-zero / invalid warnings that zero
# counts cause (in the ratio and in log10) for this loop only, instead of
# changing NumPy's error handling for the whole kernel.
SHUFFLE_SEED = 0
plt.figure(figsize=(15,8))
names = list(d.keys())
np.random.RandomState(SHUFFLE_SEED).shuffle( names )
years = range(firstyear, lastyear + 1)
with np.errstate(divide='ignore', invalid='ignore'):
    for i, name in enumerate( names ):
        plt.plot(years,
                 np.log10(d[name]['F'] / d[name]['M']),
                 color='green', alpha=0.2)
        if i > 30000: break
plt.ylabel('log10 F/M');