In [3]:
import pandas as pd
# Show the installed pandas version so the results can be tied to a specific release.
pd.__version__
Out[3]:
'0.25.1'
In [ ]:
# Plain load with default parsing. The next cell re-reads the same file with a
# converter that strips ',' from 'Total Charges' and casts it to float, so this
# version is presumably superseded (its execution count is blank) -- TODO confirm
# and drop it so a full re-run does not read the file twice.
s = pd.read_csv('Hospital_Inpatient_Discharges__SPARCS_De-Identified___2016.csv')
In [7]:
def _charges_to_float(text):
    """Remove ',' characters from a raw CSV field and convert the rest to float."""
    return float(text.replace(',', ''))


# Load the discharge table; 'Total Charges' is parsed through the converter
# above so the column arrives as floats.
s = pd.read_csv(
    'Hospital_Inpatient_Discharges__SPARCS_De-Identified___2016.csv',
    converters={'Total Charges': _charges_to_float},
)
In [8]:
# Number of rows in the loaded table.
s.shape[0]
Out[8]:
2343429
In [5]:
# List the column labels to see which fields are available.
s.columns
Out[5]:
Index(['Health Service Area', 'Hospital County',
       'Operating Certificate Number', 'Facility Id', 'Facility Name',
       'Age Group', 'Zip Code - 3 digits', 'Gender', 'Race', 'Ethnicity',
       'Length of Stay', 'Type of Admission', 'Patient Disposition',
       'Discharge Year', 'CCS Diagnosis Code', 'CCS Diagnosis Description',
       'CCS Procedure Code', 'CCS Procedure Description', 'APR DRG Code',
       'APR DRG Description', 'APR MDC Code', 'APR MDC Description',
       'APR Severity of Illness Code', 'APR Severity of Illness Description',
       'APR Risk of Mortality', 'APR Medical Surgical Description',
       'Payment Typology 1', 'Payment Typology 2', 'Payment Typology 3',
       'Attending Provider License Number',
       'Operating Provider License Number', 'Other Provider License Number',
       'Birth Weight', 'Abortion Edit Indicator',
       'Emergency Department Indicator', 'Total Charges', 'Total Costs'],
      dtype='object')
In [14]:
# Sum of the 'Total Charges' column over every row of the table.
s['Total Charges'].sum()
Out[14]:
107807177917.20009
In [6]:
# Population figure used to normalise the total charges in the next cell.
# Presumably New York State (the records reference NY facilities) -- TODO
# confirm the source and reference year of this number.
nypop = 19.54e6
In [7]:
# Total charges divided by the population figure, i.e. charges per head of `nypop`.
s['Total Charges'].sum()/nypop
Out[7]:
5517.255778771755
In [8]:
# Redisplay the column total (same expression as an earlier cell).
s['Total Charges'].sum()
Out[8]:
107807177917.20009
In [13]:
import numpy as np
#np.array(['hi',' ','there!'],dtype=str).sum()
In [15]:
# Short alias for the 'Total Charges' column, used by the cells that follow.
c = s['Total Charges']
In [16]:
# Smallest value in the charges column.
c.min()
Out[16]:
0.01
In [17]:
# Largest value in the charges column.
c.max()
Out[17]:
10477499.5
In [18]:
# Show the full record for the row holding the largest charge:
# idxmax() gives that row's index label, which .loc then looks up in the table.
s.loc[ c.idxmax()  ]
Out[18]:
Health Service Area                                                        New York City
Hospital County                                                                Manhattan
Operating Certificate Number                                                 7.00205e+06
Facility Id                                                                         1463
Facility Name                                                       NYU Hospitals Center
Age Group                                                                        0 to 17
Zip Code - 3 digits                                                                  100
Gender                                                                                 M
Race                                                                          Other Race
Ethnicity                                                              Not Span/Hispanic
Length of Stay                                                                     120 +
Type of Admission                                                                 Urgent
Patient Disposition                                Medicare Cert Long Term Care Hospital
Discharge Year                                                                      2016
CCS Diagnosis Code                                                                   131
CCS Diagnosis Description              Respiratory failure; insufficiency; arrest (ad...
CCS Procedure Code                                                                    34
CCS Procedure Description                                        TRACHEOSTOMY; TEMP/PERM
APR DRG Code                                                                         121
APR DRG Description                                 Other respiratory & chest procedures
APR MDC Code                                                                           4
APR MDC Description                     Diseases and Disorders of the Respiratory System
APR Severity of Illness Code                                                           1
APR Severity of Illness Description                                                Minor
APR Risk of Mortality                                                              Major
APR Medical Surgical Description                                                Surgical
Payment Typology 1                                                              Self-Pay
Payment Typology 2                                                                   NaN
Payment Typology 3                                                                   NaN
Attending Provider License Number                                                 273701
Operating Provider License Number                                                 242274
Other Provider License Number                                                        NaN
Birth Weight                                                                           0
Abortion Edit Indicator                                                                N
Emergency Department Indicator                                                         N
Total Charges                                                                1.04775e+07
Total Costs                                                                  3.85336e+06
Name: 1617184, dtype: object
In [19]:
# Arithmetic mean of the charges column.
c.mean()
Out[19]:
46004.0299566149
In [20]:
# Median of the charges column.
c.median()
Out[20]:
24889.0

Make our own histograms

In [21]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Six sample points used to build a histogram by hand.
x = np.asarray([0.0, 23, 15, 5, 7, 29])

# Draw every sample along the horizontal axis (all y-values are zero).
plt.plot(x, np.zeros_like(x), 'o');
In [24]:
# Re-draw the samples along the horizontal axis.
plt.plot(x, 0 * x, 'o')
# Mark the bin boundaries with vertical lines. An explicit loop with its own
# variable name keeps the sample array `x` visually distinct from the edges.
for edge in [0, 10, 20, 30]:
    plt.axvline(edge)
In [26]:
# Bin index of each sample for bins of width 10: divide, then truncate to an integer.
(x/10).astype(int)
Out[26]:
array([0, 2, 1, 0, 0, 2])
In [27]:
# Floor division yields the same indices here, but the array keeps its float dtype.
x//10
Out[27]:
array([0., 2., 1., 0., 0., 2.])
In [28]:
# Left edge of the first bin and the common bin width.
xmin, binwidth = 0, 10
# Bin index of each sample: its offset from xmin measured in bin widths,
# truncated to an integer.
np.divide(x - xmin, binwidth).astype(int)
Out[28]:
array([0, 2, 1, 0, 0, 2])
In [37]:
# Count how many samples fall into each of `nbins` equal-width bins that start
# at `xmin` and are `binwidth` wide.
nbins = 3
bincounts = np.zeros(nbins, dtype=int)
# Use floor rather than integer truncation: truncation rounds toward zero, so
# a sample slightly below xmin would be assigned to bin 0 instead of bin -1.
binlocations = np.floor((x - xmin) / binwidth).astype(int)
# Keep only samples that land inside the histogram range. A negative index
# would silently wrap around to the last bins, and an index >= nbins would
# raise an IndexError inside np.add.at.
inrange = (binlocations >= 0) & (binlocations < nbins)
# np.add.at accumulates correctly when the same bin index occurs several times
# (a plain `bincounts[binlocations] += 1` would count each bin at most once).
np.add.at(bincounts, binlocations[inrange], 1)
bincounts
Out[37]:
array([3, 1, 2])
In [41]:
# Midpoint of every bin: the left edges (evenly spaced from xmin, upper end
# excluded) shifted right by half a bin width.
bincenters = binwidth/2 + np.linspace(
    start=xmin,
    stop=xmin + binwidth*nbins,
    num=nbins,
    endpoint=False,
)
bincenters
Out[41]:
array([ 5., 15., 25.])
In [44]:
# Draw the hand-built histogram: one bar per bin, centred on the bin midpoint,
# with white edges separating neighbouring bars.
plt.bar(bincenters, bincounts, width=binwidth, edgecolor='w');
In [45]:
from histogram import histogram
In [48]:
# Same toy data, now binned by the imported `histogram` helper.
# NOTE(review): arguments are presumably (data, lower, upper, nbins) -- confirm
# against histogram.py.
bincenters, bincounts = histogram(x, 0, 30, 3, rawcounts=False)
# Derive the bar width from the returned centers instead of relying on a
# `binwidth` value left over from an earlier cell.
binwidth = bincenters[1] - bincenters[0]
# edgecolor='w' is left off here. The trailing ';' must be the last thing on
# the line so the return value of plt.bar is not echoed.
plt.bar(bincenters, bincounts, binwidth);
In [49]:
# Bin the toy data with the imported `histogram` helper.
# NOTE(review): arguments are presumably (data, lower, upper, nbins) -- confirm
# against histogram.py.
bincenters, bincounts = histogram(x, 0, 30, 3, rawcounts=False)
# Bar width = spacing between neighbouring bin centers.
binwidth = bincenters[1] - bincenters[0]
# edgecolor='w' is left off here. In the original line the ';' sat inside the
# trailing comment, so the container repr was echoed as cell output; keep the
# ';' as the final character to suppress it.
plt.bar(bincenters, bincounts, binwidth);
Out[49]:
<Container object of 3 artists>
In [51]:
# Charges over a wide range, 0 to 1.5e7, requested as 20 bins.
# NOTE(review): argument meaning (data, lower, upper, nbins) is assumed from
# how the helper is used in this notebook -- confirm against histogram.py.
bincenters, bincounts = histogram(c, 0, 1.5e7, 20, rawcounts=False)
# Bar width = spacing between neighbouring bin centers.
binwidth = bincenters[1] - bincenters[0]
plt.bar(bincenters, bincounts, width=binwidth);
In [53]:
# Zoom in on charges between 0 and 1e5, requested as 20 bins.
# NOTE(review): argument meaning (data, lower, upper, nbins) is assumed from
# how the helper is used in this notebook -- confirm against histogram.py.
bincenters, bincounts = histogram(c, 0, 1e5, 20, rawcounts=False)
# Bar width = spacing between neighbouring bin centers.
binwidth = bincenters[1] - bincenters[0]
plt.bar(bincenters, bincounts, width=binwidth, edgecolor='w');
In [54]:
# Same 0 to 1e5 range, now with a much finer binning (100 requested).
# NOTE(review): argument meaning (data, lower, upper, nbins) is assumed from
# how the helper is used in this notebook -- confirm against histogram.py.
bincenters, bincounts = histogram(c, 0, 1e5, 100, rawcounts=False)
# Bar width = spacing between neighbouring bin centers.
binwidth = bincenters[1] - bincenters[0]
plt.bar(bincenters, bincounts, width=binwidth, edgecolor='w');
In [55]:
# Same 0 to 1e5 range at the coarsest setting (2 requested bins).
# NOTE(review): argument meaning (data, lower, upper, nbins) is assumed from
# how the helper is used in this notebook -- confirm against histogram.py.
bincenters, bincounts = histogram(c, 0, 1e5, 2, rawcounts=False)
# Bar width = spacing between neighbouring bin centers.
binwidth = bincenters[1] - bincenters[0]
plt.bar(bincenters, bincounts, width=binwidth, edgecolor='w');

How much random noise to expect in bin heights for slowly varying distributions?

In [1]:
from IPython.display import Image
# Render an image inline as the cell output.
# NOTE(review): the path is relative, so the JPEG is presumably expected to sit
# next to the notebook -- confirm it is shipped with it.
Image('20191007_160606.jpg')
Out[1]: