In [1]:
import pandas as pd
s = pd.read_csv('Hospital_Inpatient_Discharges__SPARCS_De-Identified___2016.csv')
/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py:3058: DtypeWarning: Columns (10) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)

A good histogram of Birth Weight?

In [3]:
dd = 'CCS Diagnosis Description'
b = s[ s[dd]=='Liveborn' ]
#b.head()
In [6]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
In [8]:
from histogram import histogram
bincenters,bincounts = histogram( b['Birth Weight'], 0,5000, 20, rawcounts=True )
binwidth = bincenters[1]-bincenters[0]
plt.bar( bincenters, bincounts, binwidth ); #, edgecolor='w' );
In [10]:
bincenters,bincounts = histogram( b['Birth Weight'], 0,5000, 30, rawcounts=True )
binwidth = bincenters[1]-bincenters[0]
plt.bar( bincenters, bincounts, binwidth ); 
In [11]:
bincenters,bincounts = histogram( b['Birth Weight'], 0,5000, 49, rawcounts=True )
binwidth = bincenters[1]-bincenters[0]
plt.bar( bincenters, bincounts, binwidth ); 
In [13]:
bincenters,bincounts = histogram( b['Birth Weight'], 1000,5000, 50, rawcounts=True )
binwidth = bincenters[1]-bincenters[0]
plt.bar( bincenters, bincounts, binwidth ); 
In [14]:
bincenters,bincounts = histogram( b['Birth Weight'], 0,5000, 100, rawcounts=True )
binwidth = bincenters[1]-bincenters[0]
plt.bar( bincenters, bincounts, binwidth ); 

Data is already discretized: there are hazards of rediscretizing (binning) discrete data, as seen above!

In [15]:
b['Birth Weight'].value_counts()
Out[15]:
3200    18479
3300    18291
3400    18084
3100    17243
3500    16141
        ...  
6300       19
8200       18
5900       16
9900       16
6900       15
Name: Birth Weight, Length: 90, dtype: int64

Grouping

In [28]:
df = pd.DataFrame([['fruit','apple',23],
                       ['fruit','orange',67],
                       ['fruit','banana',113],
                       ['veg','broccoli',5],
                       ['veg','carrot',15]], columns=['kind','name','number sold'])
display(df)
kind name number sold
0 fruit apple 23
1 fruit orange 67
2 fruit banana 113
3 veg broccoli 5
4 veg carrot 15
In [32]:
g = df.groupby('kind')

We can iterate over such an object

In [33]:
for item in g:
    print(type(item))
<class 'tuple'>
<class 'tuple'>
In [34]:
for item in g:
    print(len(item))
2
2
In [35]:
for x,y in g:
    print(x)
    display(y)
fruit
kind name number sold
0 fruit apple 23
1 fruit orange 67
2 fruit banana 113
veg
kind name number sold
3 veg broccoli 5
4 veg carrot 15
In [36]:
g.agg({ 'number sold': 'sum' })
Out[36]:
number sold
kind
fruit 203
veg 20
In [37]:
g.agg({ 'number sold': np.sum })
Out[37]:
number sold
kind
fruit 203
veg 20
In [38]:
g.agg({ 'number sold': 'count' })
Out[38]:
number sold
kind
fruit 3
veg 2
In [39]:
g.agg({ 'number sold': 'mean' })
Out[39]:
number sold
kind
fruit 67.666667
veg 10.000000
In [40]:
g.agg({ 'number sold': 'median' })
Out[40]:
number sold
kind
fruit 67
veg 10
In [41]:
g.agg({ 'number sold': 'median' }).rename(columns={'number sold':'median number sold'})
Out[41]:
median number sold
kind
fruit 67
veg 10
In [42]:
g.agg({ 'number sold': 'median' }).rename(columns={'number sold':'median number sold'}).reset_index()
Out[42]:
kind median number sold
0 fruit 67
1 veg 10

Use the above methods (groupby and agg) to find the most common diagnoses.

In [43]:
s.columns
Out[43]:
Index(['Health Service Area', 'Hospital County',
       'Operating Certificate Number', 'Facility Id', 'Facility Name',
       'Age Group', 'Zip Code - 3 digits', 'Gender', 'Race', 'Ethnicity',
       'Length of Stay', 'Type of Admission', 'Patient Disposition',
       'Discharge Year', 'CCS Diagnosis Code', 'CCS Diagnosis Description',
       'CCS Procedure Code', 'CCS Procedure Description', 'APR DRG Code',
       'APR DRG Description', 'APR MDC Code', 'APR MDC Description',
       'APR Severity of Illness Code', 'APR Severity of Illness Description',
       'APR Risk of Mortality', 'APR Medical Surgical Description',
       'Payment Typology 1', 'Payment Typology 2', 'Payment Typology 3',
       'Attending Provider License Number',
       'Operating Provider License Number', 'Other Provider License Number',
       'Birth Weight', 'Abortion Edit Indicator',
       'Emergency Department Indicator', 'Total Charges', 'Total Costs'],
      dtype='object')
In [51]:
s.groupby(dd).agg({dd:'count'}).rename(columns={dd:'count'}).sort_values('count',ascending=False).head()
Out[51]:
count
CCS Diagnosis Description
Liveborn 224615
Septicemia (except in labor) 111075
Osteoarthritis 67235
Mood disorders 54227
Alcohol-related disorders 50459
In [53]:
(s
 .groupby(dd)
 .agg({dd:'count'})
 .rename(columns={dd:'count'})
 .sort_values('count',ascending=False)
 .head()
)
Out[53]:
count
CCS Diagnosis Description
Liveborn 224615
Septicemia (except in labor) 111075
Osteoarthritis 67235
Mood disorders 54227
Alcohol-related disorders 50459
In [56]:
(s[s[dd].str.contains('ancer')]
 .groupby(dd)
 .agg({dd:'count'})
 .rename(columns={dd:'count'})
 .sort_values('count',ascending=False)
 .reset_index()
)
Out[56]:
CCS Diagnosis Description count
0 Cancer of bronchus; lung 9388
1 Cancer of colon 5995
2 Cancer of breast 5238
3 Cancer of prostate 4500
4 Cancer of kidney and renal pelvis 3368
5 Cancer of pancreas 2989
6 Cancer of brain and nervous system 2943
7 Cancer of rectum and anus 2518
8 Cancer of head and neck 2357
9 Cancer of liver and intrahepatic bile duct 2274
10 Cancer of uterus 2243
11 Cancer of bladder 2222
12 Cancer of stomach 2175
13 Cancer of ovary 1759
14 Cancer of other GI organs; peritoneum 1568
15 Cancer of thyroid 1232
16 Cancer of bone and connective tissue 1148
17 Cancer of esophagus 994
18 Cancer of cervix 768
19 Cancer; other and unspecified primary 630
20 Cancer of other female genital organs 418
21 Other non-epithelial cancer of skin 414
22 Cancer of other urinary organs 274
23 Cancer; other respiratory and intrathoracic 221
24 Cancer of testis 120
25 Cancer of other male genital organs 78
In [57]:
cc = (s[s[dd].str.contains('ancer')]
 .groupby(dd)
 .agg({dd:'count'})
 .rename(columns={dd:'count'})
 .sort_values('count',ascending=False)
 .reset_index()
)
In [62]:
import altair as alt
alt.renderers.enable('notebook')
alt.Chart(cc).mark_bar().encode(x='count',y=alt.Y(dd,sort=None))
Out[62]:

In [96]:
dc = 'CCS Diagnosis Code'
s[dc].value_counts()
Out[96]:
218    224615
2      111075
203     67235
657     54227
660     50459
        ...  
206        26
86         24
176        13
174        11
663         3
Name: CCS Diagnosis Code, Length: 264, dtype: int64

Li Li points out that there are forms of cancer that don't have the word "cancer" in their name:

In [6]:
dd = 'CCS Diagnosis Description'
sc = s[ (s[dd].str.contains('ancer')) | (s[dd].str.contains('elanoma')) | (s[dd].str.contains('ymphoma'))]
In [7]:
cc = ( sc
 .groupby((dd,'Age Group'))
 .agg({dd:'count'})
 .rename(columns={dd:'count'})
 .reset_index()
     )
cc
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:2: FutureWarning: Interpreting tuple 'by' as a list of keys, rather than a single key. Use 'by=[...]' instead of 'by=(...)'. In the future, a tuple will always mean a single key.
  
Out[7]:
CCS Diagnosis Description Age Group count
0 Cancer of bladder 0 to 17 2
1 Cancer of bladder 30 to 49 59
2 Cancer of bladder 50 to 69 757
3 Cancer of bladder 70 or Older 1404
4 Cancer of bone and connective tissue 0 to 17 142
... ... ... ...
123 Other non-epithelial cancer of skin 0 to 17 5
124 Other non-epithelial cancer of skin 18 to 29 4
125 Other non-epithelial cancer of skin 30 to 49 38
126 Other non-epithelial cancer of skin 50 to 69 137
127 Other non-epithelial cancer of skin 70 or Older 230

128 rows × 3 columns

In [9]:
alt.Chart(cc).mark_bar().encode(x='count',y=alt.Y(dd,sort=None),color='Age Group')
Out[9]:

Courtney supplied the following code to sort the bars in a nice way:

In [8]:
import altair as alt
alt.renderers.enable('notebook')
(alt.Chart(cc,title='plot of cancer diagnosis counts')
 .mark_bar()
 .encode( y=alt.Y('CCS Diagnosis Description',
                  sort=alt.EncodingSortField(field = "count", op = 'sum', order='descending')),
          x='count', 
          color = "Age Group")
)
Out[8]:

Joining tables

In [81]:
fn = pd.read_csv('firstname.csv')
ln = pd.read_csv('lastname.csv')
t  = pd.read_csv('teaching.csv',sep='\t')
In [73]:
fn
Out[73]:
person number first name
0 199 Bernard
1 216 Betsy
2 265 Darsh
3 102 Dmitri
4 42 Dorien
5 150 Eyob
6 200 John
7 293 Kevin
8 100 Li
9 214 Palak
10 201 Simone
11 38 Yihe
12 215 Yingjie
In [74]:
ln
Out[74]:
person number last name
0 19 Badzioch
1 293 Bui
2 201 Cassani
3 215 Gao
4 102 Kireyev
5 270 Knowles
6 100 Li
7 216 McCall
8 265 Nathawani
9 214 Patel
10 200 Ringland
11 150 Tadele-Manhardt
12 42 Verbruggen
13 38 Yu
In [75]:
pd.merge(fn,ln,on='person number')
Out[75]:
person number first name last name
0 216 Betsy McCall
1 265 Darsh Nathawani
2 102 Dmitri Kireyev
3 42 Dorien Verbruggen
4 150 Eyob Tadele-Manhardt
5 200 John Ringland
6 293 Kevin Bui
7 100 Li Li
8 214 Palak Patel
9 201 Simone Cassani
10 38 Yihe Yu
11 215 Yingjie Gao
In [77]:
pd.merge(fn,ln,on='person number',how='left').fillna('')
Out[77]:
person number first name last name
0 199 Bernard
1 216 Betsy McCall
2 265 Darsh Nathawani
3 102 Dmitri Kireyev
4 42 Dorien Verbruggen
5 150 Eyob Tadele-Manhardt
6 200 John Ringland
7 293 Kevin Bui
8 100 Li Li
9 214 Palak Patel
10 201 Simone Cassani
11 38 Yihe Yu
12 215 Yingjie Gao
In [78]:
pd.merge(fn,ln,on='person number',how='right').fillna('')
Out[78]:
person number first name last name
0 216 Betsy McCall
1 265 Darsh Nathawani
2 102 Dmitri Kireyev
3 42 Dorien Verbruggen
4 150 Eyob Tadele-Manhardt
5 200 John Ringland
6 293 Kevin Bui
7 100 Li Li
8 214 Palak Patel
9 201 Simone Cassani
10 38 Yihe Yu
11 215 Yingjie Gao
12 19 Badzioch
13 270 Knowles
In [79]:
pd.merge(fn,ln,on='person number',how='outer').fillna('')
Out[79]:
person number first name last name
0 199 Bernard
1 216 Betsy McCall
2 265 Darsh Nathawani
3 102 Dmitri Kireyev
4 42 Dorien Verbruggen
5 150 Eyob Tadele-Manhardt
6 200 John Ringland
7 293 Kevin Bui
8 100 Li Li
9 214 Palak Patel
10 201 Simone Cassani
11 38 Yihe Yu
12 215 Yingjie Gao
13 19 Badzioch
14 270 Knowles
In [82]:
t
Out[82]:
TERMSOURCEKEY TERM CATALOGNUMBERSOURCEKEY CLASSSECTION FACULTYSOURCEKEY COURSESOURCEKEY COURSETYPESOURCEKEY COURSEDESCRIPTION ENROLLTOTAL SECTIONCOUNT SECTIONCAPACITY ROOMCAPACITY ROOM CLASSMEETINGPATTERNSOURCEKEY CLASSSTARTTIME CLASSENDTIME CAMPUS ACADEMICORG
0 2199 Fall 2019 800TUT BAD 199 12214 TUT Thesis Guidance 1 1 999 0 Arr Arr ARR 12:00AM 12:00AM Arranged Mathematics
1 2199 Fall 2019 801TUT BAD 199 12215 TUT Reading and Conference 1 1 999 0 Arr Arr ARR 12:00AM 12:00AM Arranged Mathematics
2 2199 Fall 2019 499TUT RIN 200 16749 TUT Independent Study 2 1 999 0 Arr Arr ARR 12:00AM 12:00AM Arranged Mathematics
3 2199 Fall 2019 563LEC GX 200 19148 LEC Topics in Applied Math 12 1 9 56 Math 250 MW 2:00PM 3:50PM North Campus Mathematics
4 2199 Fall 2019 448LR 0 200 301203 LEC Data-Oriented Computing 19 1 20 56 Math 250 MW 2:00PM 3:50PM North Campus Mathematics
5 2199 Fall 2019 337LR 0 201 16125 LEC Scientific and Math Comp 30 1 30 56 Math 250 TR 12:30PM 2:20PM North Campus Mathematics
6 2199 Fall 2019 337LR R 201 16125 REC Scientific and Math Comp 30 1 30 0 Math Arr ARR 12:00AM 12:00AM North Campus Mathematics
7 2199 Fall 2019 142LR T 201 11295 LEC College Calculus 2 60 1 60 71 Knox 04 TR 9:30AM 10:50AM North Campus Mathematics
8 2199 Fall 2019 309LR T 199 14256 LEC Intro Linear Algebra 90 1 90 97 Nsc 205 TR 9:30AM 10:50AM North Campus Mathematics
In [86]:
pd.merge(fn,t,left_on='person number', right_on='FACULTYSOURCEKEY',how='left').fillna('')
Out[86]:
person number first name TERMSOURCEKEY TERM CATALOGNUMBERSOURCEKEY CLASSSECTION FACULTYSOURCEKEY COURSESOURCEKEY COURSETYPESOURCEKEY COURSEDESCRIPTION ENROLLTOTAL SECTIONCOUNT SECTIONCAPACITY ROOMCAPACITY ROOM CLASSMEETINGPATTERNSOURCEKEY CLASSSTARTTIME CLASSENDTIME CAMPUS ACADEMICORG
0 199 Bernard 2199 Fall 2019 800TUT BAD 199 12214 TUT Thesis Guidance 1 1 999 0 Arr Arr ARR 12:00AM 12:00AM Arranged Mathematics
1 199 Bernard 2199 Fall 2019 801TUT BAD 199 12215 TUT Reading and Conference 1 1 999 0 Arr Arr ARR 12:00AM 12:00AM Arranged Mathematics
2 199 Bernard 2199 Fall 2019 309LR T 199 14256 LEC Intro Linear Algebra 90 1 90 97 Nsc 205 TR 9:30AM 10:50AM North Campus Mathematics
3 216 Betsy
4 265 Darsh
5 102 Dmitri
6 42 Dorien
7 150 Eyob
8 200 John 2199 Fall 2019 499TUT RIN 200 16749 TUT Independent Study 2 1 999 0 Arr Arr ARR 12:00AM 12:00AM Arranged Mathematics
9 200 John 2199 Fall 2019 563LEC GX 200 19148 LEC Topics in Applied Math 12 1 9 56 Math 250 MW 2:00PM 3:50PM North Campus Mathematics
10 200 John 2199 Fall 2019 448LR 0 200 301203 LEC Data-Oriented Computing 19 1 20 56 Math 250 MW 2:00PM 3:50PM North Campus Mathematics
11 293 Kevin
12 100 Li
13 214 Palak
14 201 Simone 2199 Fall 2019 337LR 0 201 16125 LEC Scientific and Math Comp 30 1 30 56 Math 250 TR 12:30PM 2:20PM North Campus Mathematics
15 201 Simone 2199 Fall 2019 337LR R 201 16125 REC Scientific and Math Comp 30 1 30 0 Math Arr ARR 12:00AM 12:00AM North Campus Mathematics
16 201 Simone 2199 Fall 2019 142LR T 201 11295 LEC College Calculus 2 60 1 60 71 Knox 04 TR 9:30AM 10:50AM North Campus Mathematics
17 38 Yihe
18 215 Yingjie
In [93]:
sheets = {
'f19_reviewer.xlsx':'1kJpIeML2f2t6UxxGFzOTOQPM0xtyUgJVVBrW3I9nyjU', # except change first Z to K
'f19_movie.xlsx'   :'1rrAUcQxNTGcwZTnUEI5kZJvDGjLE7V1LvB5Rey5rY1w', # except change first Z to r
'f19_rating.xlsx'  :'108q2iAlSiq8AQUJCOVa5uj8lkjIpBIPLIr83C41amYw'} # except change first Z to q

import requests
for sheet in sheets:
                 url = 'https://docs.google.com/spreadsheets/d/' + sheets[sheet] + '/export?format=xlsx'
                 r = requests.get(url)
                 with open(sheet,'wb') as f:
                         f.write(r.content)
                 print('Downloaded '+sheet)
Downloaded f19_reviewer.xlsx
Downloaded f19_movie.xlsx
Downloaded f19_rating.xlsx
In [94]:
movie = pd.read_excel('f19_movie.xlsx')
movie.head()
Out[94]:
movie_id year title director
0 100.0 2014.0 Birdman Alejandro G. Iñárritu
1 101.0 2004.0 The Incredibles Brad Bird
2 102.0 2009.0 City Hall Shin Woo-chul
3 103.0 1999.0 Topsy Turvy Mike Leigh
4 104.0 2018.0 Avengers: EndGame Anthony RUsso
In [95]:
movie['title'].value_counts()
Out[95]:
Hero                              1
Saving Private Ryan               1
Rat Race                          1
The Martian                       1
Incredibles 2                     1
                                 ..
The Terminal                      1
Unfriended: Dark Web              1
This is the End                   1
Crouching Tiger, Hidden Dragon    1
Bicycle Thieves                   1
Name: title, Length: 107, dtype: int64
In [ ]: