In [1]:
from glob import glob
from numpy import *
files = sorted( glob('names/yob*.txt') )
nyears = len(files)
def year(filename): return int(filename[-8:-4])
firstyear = year(files[0])
d = {}
gd = {'F':0,'M':1}
for file in files:
    f = open(file)
    lines = f.read().split('\n')
    for line in lines:
        if len(line)==0: continue
        name,gender,count = line.split(',')
        count = int(count)
        if name not in d:
            d[name] = zeros((2,nyears),dtype=int)
        d[name][ gd[gender], year(file)-firstyear] = count
    f.close()
d['Edward']
Out[1]:
array([[    0,     0,     5,     7,     9,     5,    11,    11,     9,
           12,    13,     6,     9,    10,    13,    13,     9,    10,
            9,     9,     8,     0,    12,    10,     7,    11,     9,
            9,     7,    12,    18,    11,    27,    42,    48,    53,
           43,    61,    63,    65,    64,    80,    93,    79,    97,
          106,   113,   112,   138,   132,   102,   101,    86,    60,
           66,    69,    64,    59,    57,    50,    45,    47,    70,
           55,    49,    44,    52,    55,    51,    50,    61,    44,
           53,    50,    45,    52,    69,    58,    65,    63,    70,
           67,    68,    65,    60,    85,    63,    61,    68,    69,
           62,    68,    78,    58,    57,    58,    50,    46,    53,
           66,    61,    46,    52,    33,    43,    44,    40,    40,
           40,    25,    19,    19,    18,    18,    18,     9,    10,
            5,     8,     0,    11,     8,     6,     5,    10,     0,
            0,     7,     0,     0,     0,     0,     0,     0,     5,
            0],
       [ 2364,  2177,  2477,  2250,  2439,  2220,  2312,  2125,  2470,
         2299,  2282,  1989,  2416,  2309,  2179,  2203,  2296,  2121,
         2337,  1901,  2720,  1917,  2294,  2268,  2334,  2366,  2398,
         2576,  2707,  2935,  3408,  4164,  7936,  9474, 12318, 15889,
        17005, 17502, 19490, 18536, 20097, 20818, 20419, 20597, 21128,
        20093, 19379, 19113, 18480, 17229, 17345, 15642, 15184, 13787,
        13922, 13836, 14193, 14924, 14547, 14428, 14400, 15579, 17465,
        17712, 16472, 15869, 18573, 20549, 18978, 19172, 18727, 19890,
        19521, 18970, 19510, 19258, 19417, 18705, 17372, 16909, 16585,
        15877, 15525, 15322, 15601, 14418, 13276, 12662, 12290, 12466,
        12299, 11063,  9304,  8375,  7825,  7391,  7054,  6848,  6481,
         6879,  6762,  6658,  6291,  5997,  5913,  5868,  5776,  5901,
         5796,  5849,  5741,  5570,  5229,  4787,  4523,  4141,  4072,
         3913,  3573,  3591,  3480,  3372,  3239,  3107,  3146,  2970,
         2867,  2821,  2786,  2979,  2901,  2660,  2589,  2698,  2578,
         2587]])
In [2]:
%pylab inline
Populating the interactive namespace from numpy and matplotlib
/home/ringland/anaconda3/lib/python3.5/site-packages/IPython/core/magics/pylab.py:161: UserWarning: pylab import has clobbered these variables: ['f']
`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"

Let's plot the "gender bias" of the name Leslie over time. I.e. the ratio of frequency in females to frequency in males:

In [3]:
name='Leslie'
plot( range(firstyear,year(files[-1])+1)   ,d[name][0]/d[name][1] ,'g');

The above plot of the ratio of female to male frequency of the name is not satisfactory because the male-dominant part of the history is all squashed into invisibility near the t-axis.

More symmetrical if we take the log of the ratio:

In [4]:
semilogy( range(firstyear,year(files[-1])+1)   ,d[name][0]/d[name][1] ,'g');

Now we can see the detail at both extremes!

Next, let's do it for all names in the database:

In [6]:
figure(figsize=(15,6))
for name in d:
    if d[name][0].sum()>0 and d[name][1].sum()>0: # if there is at least on M and one F in at least one year
        semilogy( range(firstyear,year(files[-1])+1)   ,d[name][0]/d[name][1] ,'g',alpha=0.1);
ylabel('Female/Male')
/home/ringland/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:4: RuntimeWarning: divide by zero encountered in true_divide
/home/ringland/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:4: RuntimeWarning: invalid value encountered in true_divide
Out[6]:
<matplotlib.text.Text at 0x7f6c47242d68>
/home/ringland/anaconda3/lib/python3.5/site-packages/matplotlib/scale.py:93: RuntimeWarning: invalid value encountered in less_equal
  mask = a <= 0.0