Basic observations of the Misconduct Database ( Part 2 )
Exploring the variables in the Misconduct database
For info on how I came by this data, see part1. I wanted to break it free from looking at one or a handful of entries at a time as you find on the main website and see what I can find by looking at the data a bit differently. Keep in mind all of the caveats about this data.
In this post, I'll make some basic observations about the misconduct data that I've pulled, continuing to update the charts as I explore.
In addition, I'm practicing plotting with the seaborn package.
import pandas as pd
import numpy as np
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style="ticks")
data = pd.read_csv("instance_details.csv")
print(data.shape)
data.head()
Clean Data¶
- make penalty into a number ( instead of a string with commas and $ signs)
- make disclosed / undisclosed a separate flag
- make date into a date
- make date into year and month_year ( eg: 2014_01 )
# Flag for Non-numeric penalty
data['penalty_reported'] = (data.penalty
.map(lambda x: 1
if x[0] =='$'
else x )
)
# Amount into a numeric
data['penalty_amount'] = (data.penalty
.str.replace(',', '')
.str.replace('$', '')
.map(lambda x: int(x)
if x[0] != 'U'
else None)
)
data['penalty_amount_millions'] = data['penalty_amount'].map(
lambda x: float(x)/1000000
if x
else None)
# Date parsing
data['settle_date'] = pd.to_datetime(
data.settlementDate , infer_datetime_format =True)
data['settle_year'] = data.settle_date.dt.year
data['settle_month'] = data.settle_date.dt.month
data['settle_month_year'] = data.settle_date.map(
lambda x: "{}_{}".format(
x.year, x.month
if len(str(x.month)) > 1
else "0{}".format(x.month)
)
)
data.to_csv("clean_instance_details.csv", index=False)
p = sns.distplot(
data.penalty_amount_millions.dropna()
, kde = False).set_title('histogram of penalty amounts before removing outliers')
# remove outliers in penalty amount
outliers = data[np.abs(data.penalty_amount_millions - data.penalty_amount_millions.mean())\
>=(3*data.penalty_amount_millions.std())]
data = data[~data.index.isin(outliers.index)]
print(data.shape)
print(outliers.shape)
outliers[['contractors.text', 'settle_year','court type', 'type' , 'penalty_amount_millions']].sort_values('penalty_amount_millions', ascending = False)
outliers.url.values
p = sns.distplot(
data.penalty_amount_millions.dropna()
, kde = False
, bins = 30).set_title('histogram of penalty amounts after outlier removal')
p = sns.pointplot(data = data
, y = 'penalty_amount_millions'
, x = 'settle_year'
, order = range(1995,2016)
, estimator = np.median
).set_title('median penalty per year')
x = plt.xticks(rotation=90)
Helper functions¶
def cumulative_col(df, col):
colname= 'cum_sum__{}'.format(col)
df[colname] = df[col].cumsum()
df['cum_perc__{}'.format(col)] = 100*df[colname]/df[col].sum()
Contractor instances¶
Of the 152 contractors referenced in the database, 50% of observed misconduct instances involve one of 20 contractors.
25% of misconduct instances involve one of 7 contractors
len(data['contractors.text'].unique())
contractors = (data
.groupby('contractors.text')
.index
.count()
.sort_values(ascending=False)
.reset_index()
)
cumulative_col(contractors, 'index')
print(contractors.head(7))
cumulative = contractors.cum_perc__index.plot()
myplot = sns.distplot(
data.groupby('contractors.text').index.count().sort_values(ascending=False)
, kde=False
, bins=10
, axlabel = 'number of contractors \n binned by count of misconduct instances'
# , label = 'number of contractors'
).set_title("number of contractors with X infractions")
contractors = (data
[data.settle_year >= 2000 ]
.groupby('contractors.text')
.index
.count()
.sort_values(ascending=False)
.reset_index()
)
cumulative_col(contractors, 'index')
print(contractors.head(7))
cumulative = contractors.cum_perc__index.plot()
A Quick look at the 7 contractors¶
who are involved in 25% of all misconduct instances since 2000
First: mean penalty amount per year
To Do:
- Some between group testing
data['top'] = data['contractors.text'].isin(contractors.head(7)['contractors.text'])
fplot = sns.factorplot(
data = data[data.top == True]
, x = 'settle_year'
, y = 'penalty_amount_millions'
, estimator = np.mean
, row = 'contractors.text'
, ci = 90
, aspect = 3.5
)
x = plt.xticks(rotation=90)
The chart below displays the number of misconduct instances per year for the 7 contractors that account for 25% of all misconduct instances in the database.
fplot = sns.factorplot(
data = (data
[data.top == True]
.groupby(['settle_year', 'contractors.text', 'penalty_reported'])
.index.count()
.reset_index()
).rename(columns ={'index':'instance_count'})
, x = 'settle_year'
, y = 'instance_count'
, estimator = np.mean
, row = 'contractors.text'
, ci = 90
, size = 3
, aspect = 4
, hue = 'penalty_reported'
)
x = plt.xticks(rotation=90)
sns.factorplot(
data = data[data['type'].isin(['Environment','Health', 'Labor'])]
, x = 'settle_year'
, y = 'penalty_amount_millions'
, hue = 'top'
, row = 'type'
, estimator = np.mean
, size = 2
, aspect = 4)
x = plt.xticks(rotation=90)
fplot = sns.factorplot(
data = (data)
, x = 'settle_year'
, y = 'penalty_amount_millions'
, estimator = np.mean
, row = 'type'
, ci = 90
, size = 3
, aspect = 4
#, hue = 'top'
)
x = plt.xticks(rotation=90)
Contracting parties and enforcement agencies:¶
None and "non-govermental" are the largest contracting party / enforcement agency combination -- nearly 25% of all instances of misconduct in the dataset fall under that category.
50% of all instances fall under one of 7 agency/contracting party combinations, detailed below
cpa = ( data
.groupby(['agency','contracting party'])
.index.count()
.reset_index()
.sort_values('index', ascending=False)
)
cumulative_col(cpa, 'index')
cpa.head(6)
"None" is the contracting party on more than 60% of instances in the dataset.
Maybe the 'None' category is made up of suits brought by individuals, or someone other than the agencies represented in the category.
plt.xticks(rotation=90)
cp = data.groupby('contracting party').index.count().reset_index().sort_values('index',ascending=False)
cumulative_col(cp, 'index')
print(cp.head())
bp = (
sns.barplot(data = cp
, order = cp['contracting party']
, x ='contracting party'
, y = 'index'
, color ='steelblue')
.set_title("instances of misconduct by contracting party \n")
)
"Non-Governmental" is the prosecuting agency on 35% of instances in the dataset.
plt.xticks(rotation=90)
ca = data.groupby('agency').index.count().reset_index().sort_values('index',ascending=False)
cumulative_col(ca, 'index')
print(ca.head())
bp2 = (
sns.barplot(data = ca
, order = ca['agency']
, x ='agency'
, y = 'index'
, color ='steelblue')
.set_title("instances of misconduct by contracting party")
)
Later, take a look at the content of those misconduct instances, for now, observing that it makes up a large proportion of all instances
# Flag the subset for further exploration
data['flag'] = data.index.isin(
data[(data.agency == 'Non-Governmental') &
(data['contracting party'] == 'None')]
.index
).astype(int)
data.groupby('flag').index.count()
(data
.groupby(['settle_year', 'flag'])
.index
.count()
.unstack('flag')
.fillna(0)
).plot().set_title("number of settlements per year \n comparing flagged group to unflagged")
(
(data
.groupby(['settle_year', 'flag'])
.index.count().unstack().fillna(0)
).apply(lambda x: [100*x[k]/sum(x) for k in x.keys() ], axis = 1)
.plot()
).set_title("proportion of misconduct instances per year \n that are in the flagged group. \n Flagged = Non-Governmental/None\n")
Penalty amount and number of misconduct instances by month¶
plt.xticks(rotation=90)
p = sns.barplot(data= data
, x= 'settle_year'
, y = 'penalty_amount_millions'
, order = range(1995,2014)
#, color = 'steelblue'
, hue = 'flag'
).set_title("number of settlements with pentalty vs without penalty by year")
(
data
.groupby('settle_year')
.penalty_amount_millions
.agg([sum]).plot()
.set_title(" sum of all settlemens in millions by year")
)
Subset to look at one type of instance¶
Environment
env = data[data.type == 'Environment'][data.settle_year >= 2000 ]
envheat = ( pd.crosstab(index = env.settle_year, columns = env.agency
, values = env.penalty_amount_millions
, aggfunc = [np.mean, len, sum]
) ).stack()['sum'].unstack('agency')
sns.heatmap(envheat)
Disclosed penalty vs undisclosed¶
(data
.groupby('settle_year')
[['index','penalty_amount_millions']]
.count()
.plot()
.set_title("settlement count with payment vs \n without payment/ undiscolsed")
)
plt.xticks(rotation=90)
sns.boxplot(x="type"
, y='index'
, hue="penalty_reported",
data = (data
[data.settle_year >= 2000]
.groupby(['type','penalty_reported','settle_year'])
.index.count()
.reset_index()
)
).set_title('instances of misconduct per year by type')
A higher proportion of instances have undisclosed penalties over time¶
(data
.groupby('settle_year')
[['index','penalty_amount_millions']].count()
.apply(lambda x: 100*(1 - x['penalty_amount_millions']/x['index']), axis=1 )
).plot().set_title("percent of instances with undisclosed penalty by year ")
plt.xticks(rotation=90)
# Used for sorting the bars
abt = data.groupby('type').penalty_amount_millions.mean().reset_index().sort('penalty_amount_millions',ascending=False)
ax = (sns.barplot(
x="type"
, y="penalty_amount_millions"
, data=data
, order = abt.type
, color = 'steelblue')
.set_title("mean settlement amount by type"))
plt.xticks(rotation=90)
abtsum = (data.groupby('type')
.penalty_amount_millions
.agg([sum,np.mean, len])
.reset_index()
.sort('len', ascending=False)
)
print(abtsum.head())
ax = sns.barplot(
x="type"
, y= 'mean' #"penalty_amount_millions"
, data=abtsum
, order = abtsum.sort('len', ascending=False).type #abt.type
, color = 'steelblue'
).set_title("mean settlement amount by type\n\
sorted by number of instances\n ")
k = sns.heatmap(data = data.groupby(['type','settle_year'])
.index.count().unstack()
).set_title("number of instances \n by type of misconduct and year")
k = sns.heatmap(data = data.groupby(['type','settle_year'])
.penalty_amount_millions.sum().unstack()
).set_title("sum of settlement penalties in millions\n by type of misconduct and year")
Disposition¶
sns.heatmap(data = data.groupby(['disposition','settle_year']).index.count().unstack())
sns.heatmap(data = data.groupby(['disposition','type']).index.count().unstack())
plt.xticks(rotation=90)
abt = data.groupby('disposition').index.count().reset_index().sort('index',ascending=False)
ax = sns.barplot(
x="disposition"
, y="index"
, data=abt
, order = abt.disposition
, color = 'steelblue'
).set_title("total number of misconduct instances by disposition \n")