In [ ]:
 
In [2]:
# Load the debate transcript; the context manager closes the file even if
# read() raises.
with open("The_first_Trump-Clinton_presidential_debate_transcript_annotated.txt") as f:
    s = f.read()

# Strip punctuation and stage directions so the tokens are bare words.
# Note that ':' is in the punctuation set, so every colon is removed here.
punc = ',.;:!?"'
otherbadwords = ['--','(APPLAUSE)','(inaudible)','(LAUGHTER)','(CROSSTALK)']
for p in punc:
    s = s.replace(p, '')
for w in otherbadwords:
    s = s.replace(w, '')

# Re-append ':' to each upper-case speaker name so it can serve as a tag token.
speakers = ['HOLT','CLINTON','TRUMP']
for sp in speakers:
    s = s.replace(sp, sp + ':')  # restore colons

s = s.lower()
tags = [sp.lower() + ':' for sp in speakers]  # lower-cased tags, same order as speakers
words = s.split()

Idea 1

Keep a separate list of words for each speaker.

In [3]:
# Split the token stream into one word list per speaker tag.
# A tag token switches the active list; every other token is appended to
# whichever list is currently active.
h = []  # words following tags[0]
c = []  # words following tags[1]
t = []  # words following tags[2]
current = None  # no active speaker yet; avoids a NameError if text precedes the first tag
for w in words:
    if   w == tags[0]: current = h
    elif w == tags[1]: current = c
    elif w == tags[2]: current = t
    elif current is not None: current.append(w)
h[:5],c[:5],t[:5]
        
Out[3]:
(['good', 'evening', 'from', 'hofstra', 'university'],
 ['how', 'are', 'you', 'donald', 'well'],
 ['thank', 'you', 'lester', 'our', 'jobs'])
In [4]:
tuple(map(len, (h, c, t)))  # total words in each of the three lists
Out[4]:
(1939, 6342, 8562)
In [5]:
tuple(len(set(spoken)) for spoken in (h, c, t))  # counts of distinct words spoken by each
Out[5]:
(563, 1385, 1291)

Idea 2: a single dictionary mapping each word to a triple of counts (Holt, Clinton, Trump)

In [6]:
# Build one dictionary mapping each word to a triple of counts, one slot per
# word list in the order (h, c, t). A single loop replaces three copies of the
# same code, and the sort whose result was discarded is gone.
d = {}
for slot, spoken in enumerate((h, c, t)):
    for w in spoken:
        if w not in d:
            d[w] = [0, 0, 0]
        d[w][slot] += 1
dl = list(d.items())  # (word, [count_h, count_c, count_t]) pairs
dl[:10]
Out[6]:
[('104-page', [0, 0, 1]),
 ('king', [0, 1, 0]),
 ('approximately', [0, 0, 2]),
 ('seriously', [0, 0, 1]),
 ('perpetuated', [1, 0, 0]),
 ('history', [0, 1, 3]),
 ('single', [0, 1, 5]),
 ('carried', [0, 0, 1]),
 ('talks', [0, 0, 5]),
 ('provide', [1, 4, 0])]
In [7]:
## The *lambda* construct
In [8]:
# A lambda builds a small anonymous function from a single expression;
# binding it to a name lets us call it like any other function.
f = lambda x: x * 2
f(7)
Out[8]:
14
In [9]:
f("ho")  # `*` repeats a string, so the same lambda also works on text
Out[9]:
'hoho'
In [8]:
# Sort by Trump frequency (index 2 of each count triple), largest first; keep the top ten.
sorted(dl, key=lambda entry: entry[1][2], reverse=True)[:10]
Out[8]:
[('the', [95, 253, 295]),
 ('and', [44, 206, 289]),
 ('to', [83, 240, 258]),
 ('i', [16, 141, 240]),
 ('you', [65, 76, 206]),
 ('a', [27, 122, 172]),
 ('of', [39, 135, 171]),
 ('that', [22, 147, 167]),
 ('have', [27, 84, 147]),
 ('it', [10, 67, 127])]
In [9]:
# Sort by Clinton frequency (index 1 of each count triple), largest first; keep the top ten.
sorted(dl, key=lambda entry: entry[1][1], reverse=True)[:10]
Out[9]:
[('the', [95, 253, 295]),
 ('to', [83, 240, 258]),
 ('and', [44, 206, 289]),
 ('that', [22, 147, 167]),
 ('i', [16, 141, 240]),
 ('of', [39, 135, 171]),
 ('we', [22, 131, 127]),
 ('a', [27, 122, 172]),
 ('in', [24, 104, 110]),
 ('have', [27, 84, 147])]
In [15]:
%pylab inline
Populating the interactive namespace from numpy and matplotlib
In [11]:
# One point per word: Clinton's count on the x axis, Trump's on the y axis.
x = [counts[1] for _, counts in dl]  # Clinton
y = [counts[2] for _, counts in dl]  # Trump
plot(x, y, '.', alpha=.1)
Out[11]:
[<matplotlib.lines.Line2D at 0x7fe37385eeb8>]
In [13]:
# Same scatter with more opaque markers, restricted to 0..45 on both axes.
plot(x, y, '.', alpha=.5)
xlim(0, 45)
ylim(0, 45)
Out[13]:
(0, 45)
In [15]:
# Ten words with the highest Trump-to-Clinton count ratio. The +1 in the
# denominator avoids division by zero for words Clinton never used.
trumphi = sorted(
    dl,
    key=lambda entry: entry[1][2] / (entry[1][1] + 1),
    reverse=True,
)[:10]
trumphi
Out[15]:
[('clinton', [24, 0, 22]),
 ('leaving', [0, 0, 15]),
 ('agree', [0, 0, 14]),
 ('wrong', [0, 0, 13]),
 ("i'll", [2, 0, 12]),
 ('tremendous', [0, 0, 11]),
 ('politicians', [0, 0, 10]),
 ('she', [2, 3, 33]),
 ("they're", [0, 4, 41]),
 ('sean', [0, 0, 8])]
In [18]:
# Ten words with the highest Clinton-to-Trump count ratio. The +1 in the
# denominator avoids division by zero for words Trump never used.
clintonhi = sorted(
    dl,
    key=lambda entry: entry[1][1] / (entry[1][2] + 1),
    reverse=True,
)[:10]
clintonhi
Out[18]:
[('donald', [3, 26, 1]),
 ('american', [8, 11, 0]),
 ('information', [0, 9, 0]),
 ('justice', [0, 7, 0]),
 ('proposed', [0, 7, 0]),
 ('everyone', [2, 6, 0]),
 ('part', [0, 6, 0]),
 ('national', [0, 6, 0]),
 ('hope', [0, 5, 0]),
 ('both', [2, 5, 0])]

Header 1

hello $\int e^x dx$

First names database

In [20]:
from glob import glob
# Collect the per-year name files, ordered by filename.
files = glob('names/yob*.txt')
files.sort()
files
Out[20]:
['names/yob1880.txt', 'names/yob1881.txt', 'names/yob1882.txt']
In [23]:
from glob import glob
# Print each data file's path and its length in characters.
files = sorted( glob('names/yob*.txt') )
for file in files:
    # The context manager closes each file even if read() raises.
    with open(file) as f:
        s = f.read()
    print(file, len(s))
names/yob1880.txt 22933
names/yob1881.txt 22130
names/yob1882.txt 24432
names/yob1883.txt 23918
names/yob1884.txt 26373
names/yob1885.txt 26331
names/yob1886.txt 27430
names/yob1887.txt 27158
names/yob1888.txt 30413
names/yob1889.txt 29707
names/yob1890.txt 30926
names/yob1891.txt 30526
names/yob1892.txt 33621
names/yob1893.txt 32602
names/yob1894.txt 33876
names/yob1895.txt 35183
names/yob1896.txt 35656
names/yob1897.txt 34908
names/yob1898.txt 37660
names/yob1899.txt 35099
names/yob1900.txt 43142
names/yob1901.txt 36431
names/yob1902.txt 38922
names/yob1903.txt 39290
names/yob1904.txt 41258
names/yob1905.txt 42348
names/yob1906.txt 42212
names/yob1907.txt 45881
names/yob1908.txt 46823
names/yob1909.txt 49274
names/yob1910.txt 54081
names/yob1911.txt 56894
names/yob1912.txt 74378
names/yob1913.txt 81725
names/yob1914.txt 93567
names/yob1915.txt 110085
names/yob1916.txt 114180
names/yob1917.txt 116881
names/yob1918.txt 122690
names/yob1919.txt 122343
names/yob1920.txt 126997
names/yob1921.txt 128324
names/yob1922.txt 127414
names/yob1923.txt 126003
names/yob1924.txt 128739
names/yob1925.txt 126174
names/yob1926.txt 123863
names/yob1927.txt 123464
names/yob1928.txt 120549
names/yob1929.txt 116578
names/yob1930.txt 116127
names/yob1931.txt 110263
names/yob1932.txt 111425
names/yob1933.txt 106865
names/yob1934.txt 108890
names/yob1935.txt 107242
names/yob1936.txt 105432
names/yob1937.txt 106386
names/yob1938.txt 107307
names/yob1939.txt 105888
names/yob1940.txt 106674
names/yob1941.txt 108157
names/yob1942.txt 112211
names/yob1943.txt 112079
names/yob1944.txt 108972
names/yob1945.txt 107501
names/yob1946.txt 115728
names/yob1947.txt 123681
names/yob1948.txt 122154
names/yob1949.txt 122536
names/yob1950.txt 122992
names/yob1951.txt 125000
names/yob1952.txt 127079
names/yob1953.txt 129222
names/yob1954.txt 130622
names/yob1955.txt 132481
names/yob1956.txt 135233
names/yob1957.txt 137577
names/yob1958.txt 137167
names/yob1959.txt 140227
names/yob1960.txt 142151
names/yob1961.txt 145377
names/yob1962.txt 145746
names/yob1963.txt 146564
names/yob1964.txt 148144
names/yob1965.txt 143070
names/yob1966.txt 145227
names/yob1967.txt 148242
names/yob1968.txt 154448
names/yob1969.txt 164207
names/yob1970.txt 176767
names/yob1971.txt 182823
names/yob1972.txt 184064
names/yob1973.txt 187292
names/yob1974.txt 194190
names/yob1975.txt 202140
names/yob1976.txt 207577
names/yob1977.txt 216799
names/yob1978.txt 217681
names/yob1979.txt 227362
names/yob1980.txt 231862
names/yob1981.txt 232362
names/yob1982.txt 235047
names/yob1983.txt 231451
names/yob1984.txt 233172
names/yob1985.txt 240145
names/yob1986.txt 247295
names/yob1987.txt 256918
names/yob1988.txt 268625
names/yob1989.txt 285707
names/yob1990.txt 297323
names/yob1991.txt 302034
names/yob1992.txt 306062
names/yob1993.txt 311872
names/yob1994.txt 312234
names/yob1995.txt 313055
names/yob1996.txt 316585
names/yob1997.txt 323081
names/yob1998.txt 333635
names/yob1999.txt 340770
names/yob2000.txt 355233
names/yob2001.txt 360782
names/yob2002.txt 364191
names/yob2003.txt 371360
names/yob2004.txt 381352
names/yob2005.txt 387551
names/yob2006.txt 406097
names/yob2007.txt 416201
names/yob2008.txt 417447
names/yob2009.txt 413086
names/yob2010.txt 405257
names/yob2011.txt 402894
names/yob2012.txt 400705
names/yob2013.txt 394857
names/yob2014.txt 394032
names/yob2015.txt 390954

Read all the data into memory

In [12]:
from glob import glob
from numpy import *
files = sorted( glob('names/yob*.txt') )
nyears = len(files)

def year(filename):
    """Return the four-digit year just before the extension, e.g. 'names/yob1880.txt' -> 1880."""
    return int(filename[-8:-4])

firstyear = year(files[0])
d = {}                # name -> (2, nyears) integer array of counts
gd = {'F':0,'M':1}    # row index for each gender code
for file in files:
    # Column for this file's year; constant for every line in the file, so
    # compute it once instead of once per line.
    column = year(file) - firstyear
    # The context manager closes the file even if a line fails to parse.
    with open(file) as f:
        lines = f.read().split('\n')
    for line in lines:
        if len(line)==0: continue  # skip the empty string after the final newline
        name,gender,count = line.split(',')
        count = int(count)
        if name not in d:
            d[name] = zeros((2,nyears),dtype=int)
        d[name][ gd[gender], column ] = count
d['Edward']
Out[12]:
array([[    0,     0,     5,     7,     9,     5,    11,    11,     9,
           12,    13,     6,     9,    10,    13,    13,     9,    10,
            9,     9,     8,     0,    12,    10,     7,    11,     9,
            9,     7,    12,    18,    11,    27,    42,    48,    53,
           43,    61,    63,    65,    64,    80,    93,    79,    97,
          106,   113,   112,   138,   132,   102,   101,    86,    60,
           66,    69,    64,    59,    57,    50,    45,    47,    70,
           55,    49,    44,    52,    55,    51,    50,    61,    44,
           53,    50,    45,    52,    69,    58,    65,    63,    70,
           67,    68,    65,    60,    85,    63,    61,    68,    69,
           62,    68,    78,    58,    57,    58,    50,    46,    53,
           66,    61,    46,    52,    33,    43,    44,    40,    40,
           40,    25,    19,    19,    18,    18,    18,     9,    10,
            5,     8,     0,    11,     8,     6,     5,    10,     0,
            0,     7,     0,     0,     0,     0,     0,     0,     5,
            0],
       [ 2364,  2177,  2477,  2250,  2439,  2220,  2312,  2125,  2470,
         2299,  2282,  1989,  2416,  2309,  2179,  2203,  2296,  2121,
         2337,  1901,  2720,  1917,  2294,  2268,  2334,  2366,  2398,
         2576,  2707,  2935,  3408,  4164,  7936,  9474, 12318, 15889,
        17005, 17502, 19490, 18536, 20097, 20818, 20419, 20597, 21128,
        20093, 19379, 19113, 18480, 17229, 17345, 15642, 15184, 13787,
        13922, 13836, 14193, 14924, 14547, 14428, 14400, 15579, 17465,
        17712, 16472, 15869, 18573, 20549, 18978, 19172, 18727, 19890,
        19521, 18970, 19510, 19258, 19417, 18705, 17372, 16909, 16585,
        15877, 15525, 15322, 15601, 14418, 13276, 12662, 12290, 12466,
        12299, 11063,  9304,  8375,  7825,  7391,  7054,  6848,  6481,
         6879,  6762,  6658,  6291,  5997,  5913,  5868,  5776,  5901,
         5796,  5849,  5741,  5570,  5229,  4787,  4523,  4141,  4072,
         3913,  3573,  3591,  3480,  3372,  3239,  3107,  3146,  2970,
         2867,  2821,  2786,  2979,  2901,  2660,  2589,  2698,  2578,
         2587]])
In [17]:
name = 'Edward'
years = range(firstyear, year(files[-1]) + 1)  # firstyear through the last file's year, inclusive
plot(years, d[name][1], 'b')   # males
plot(years, d[name][0], 'r');  # females

The gender predominance of some names has flipped over time

In [19]:
name = 'Leslie'
years = range(firstyear, year(files[-1]) + 1)  # firstyear through the last file's year, inclusive
plot(years, d[name][0], 'r')   # females (row 0)
plot(years, d[name][1], 'b');  # males (row 1)
In [ ]: