from datascience import *
import numpy as np
import seaborn as sns
# Ask the datascience Table class to draw its plots (barh, hist, ...) interactively.
Table.interactive_plots()
# Load seaborn's built-in 'tips' dataset and convert it into a datascience Table.
tips = Table.from_df(sns.load_dataset('tips'))
# Display the table (first rows are shown in the notebook output).
tips
total_bill | tip | sex | smoker | day | time | size |
---|---|---|---|---|---|---|
16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
21.01 | 3.5 | Male | No | Sun | Dinner | 3 |
23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
25.29 | 4.71 | Male | No | Sun | Dinner | 4 |
8.77 | 2 | Male | No | Sun | Dinner | 2 |
26.88 | 3.12 | Male | No | Sun | Dinner | 4 |
15.04 | 1.96 | Male | No | Sun | Dinner | 2 |
14.78 | 3.23 | Male | No | Sun | Dinner | 2 |
... (234 rows omitted)
# Count the rows for each value of 'time' and draw those counts as a horizontal bar chart.
tips.group('time').barh('time')
# Average every column per day, keep only the mean total bill and mean tip,
# then reorder the rows so the days read Thur, Fri, Sat, Sun.
(
    tips.group('day', np.mean)
    .select('day', 'total_bill mean', 'tip mean')
    .take(3, 0, 1, 2)
)
day | total_bill mean | tip mean |
---|---|---|
Thur | 17.6827 | 2.77145 |
Fri | 17.1516 | 2.73474 |
Sat | 20.4414 | 2.9931 |
Sun | 21.41 | 3.25513 |
# Mean total bill and mean tip for each day (rows ordered Thur, Fri, Sat, Sun),
# drawn as a horizontal bar chart with custom axis titles and a chart title.
(
    tips.group('day', np.mean)
    .select('day', 'total_bill mean', 'tip mean')
    .take(3, 0, 1, 2)
    .barh(
        'day',
        xaxis_title='Dollars',
        yaxis_title='Day of the Week',
        title='Tips and Total Bills on Various Days of the Week',
    )
)
# Re-display the table for reference before plotting a histogram of one of its columns.
tips
total_bill | tip | sex | smoker | day | time | size |
---|---|---|---|---|---|---|
16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
21.01 | 3.5 | Male | No | Sun | Dinner | 3 |
23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
25.29 | 4.71 | Male | No | Sun | Dinner | 4 |
8.77 | 2 | Male | No | Sun | Dinner | 2 |
26.88 | 3.12 | Male | No | Sun | Dinner | 4 |
15.04 | 1.96 | Male | No | Sun | Dinner | 2 |
14.78 | 3.23 | Male | No | Sun | Dinner | 2 |
... (234 rows omitted)
# Histogram of the 'tip' column, with density turned off.
tips.hist('tip', density=False)
`where` and `are.between`
tips.where('tip', are.between(1.9, 2.8))
total_bill | tip | sex | smoker | day | time | size |
---|---|---|---|---|---|---|
8.77 | 2 | Male | No | Sun | Dinner | 2 |
15.04 | 1.96 | Male | No | Sun | Dinner | 2 |
20.29 | 2.75 | Female | No | Sat | Dinner | 2 |
15.77 | 2.23 | Female | No | Sat | Dinner | 2 |
17.81 | 2.34 | Male | No | Sat | Dinner | 4 |
13.37 | 2 | Male | No | Sat | Dinner | 2 |
12.69 | 2 | Male | No | Sat | Dinner | 2 |
18.35 | 2.5 | Male | No | Sat | Dinner | 4 |
20.69 | 2.45 | Female | No | Sat | Dinner | 4 |
16.31 | 2 | Male | No | Sat | Dinner | 3 |
... (69 rows omitted)
# Number of rows whose tip is at least 1.9 and less than 2.8
# (are.between includes the lower bound and excludes the upper bound).
tips.where('tip', are.between(1.9, 2.8)).num_rows
79
# Number of rows whose tip is at least 4.6 and less than 5.5
# (are.between includes the lower bound and excludes the upper bound).
tips.where('tip', are.between(4.6, 5.5)).num_rows
19
`density = False`?
Look at the histogram that results if we don't set `density = False`.
# Same histogram of 'tip', but without passing density = False, so hist uses its default setting.
tips.hist('tip')
This is a perfectly valid histogram too, but it's not one that we will study in this class.
# Small one-column table of heights used to illustrate custom histogram bins.
numbers = Table().with_columns(
    'Height',
    np.array([72, 61, 63, 74, 68, 67, 65, 73, 65, 62, 66, 69, 75, 61, 61, 61, 65, 60, 64]),
)
# Histogram with explicit bin edges 60, 64, 68, 72, 76 (each bin is 4 wide).
numbers.hist('Height', density=False, bins=[60, 64, 68, 72, 76])
We can use the same customization arguments with `hist` as we did with `barh`.
# Histogram of tips with a custom x-axis title, chart title, and a 600x600 figure size.
tips.hist(
    'tip',
    density=False,
    xaxis_title='Tip (Dollars)',
    title='Distribution of Tips',
    width=600,
    height=600,
)
# Histogram of tips with bin edges written out explicitly: 0, 1, ..., 11.
tips.hist(
    'tip',
    density=False,
    bins=np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]),
)
`np.arange`, revisited
tips.hist('tip',
density = False,
bins = np.arange(12))
Let's look at another column.
# Histogram of the 'total_bill' column with the default bins, density turned off.
tips.hist('total_bill', density=False)
Before setting bins, it's a good idea to look at the smallest and largest values in the column.
# Smallest total bill: tells us where the histogram bins need to start.
tips.column('total_bill').min()
3.07
# Largest total bill: the histogram bins must extend at least this far.
tips.column('total_bill').max()
50.81
# Bin edges of width 3 starting at 3. np.arange stops before 54, so the last
# edge is 51, which is above the largest total bill (50.81).
bins_3 = np.arange(3, 54, 3)
bins_3
array([ 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48, 51])
# Histogram of total bills using the width-3 bin edges in bins_3.
tips.hist(
    'total_bill',
    density=False,
    bins=bins_3,
    title='Distribution of Total Bills, Bin Width = 3',
    width=600,
    height=400,
)
# Bin edges of width 7 starting at 3. np.arange stops before 53, so the last
# edge is 52, which is above the largest total bill (50.81).
bins_7 = np.arange(3, 53, 7)
bins_7
array([ 3, 10, 17, 24, 31, 38, 45, 52])
# Histogram of total bills using the width-7 bin edges in bins_7.
tips.hist(
    'total_bill',
    density=False,
    bins=bins_7,
    title='Distribution of Total Bills, Bin Width = 7',
    width=600,
    height=400,
)
# Bin edges of width 10 starting at 3. np.arange stops before 63, so the last
# edge is 53, which is above the largest total bill (50.81).
bins_10 = np.arange(3, 63, 10)
bins_10
array([ 3, 13, 23, 33, 43, 53])
# Histogram of total bills using the width-10 bin edges in bins_10.
tips.hist(
    'total_bill',
    density=False,
    bins=bins_10,
    title='Distribution of Total Bills, Bin Width = 10',
    width=600,
    height=400,
)
# Re-display the table to look for a categorical column to split the histograms by.
tips
total_bill | tip | sex | smoker | day | time | size |
---|---|---|---|---|---|---|
16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
21.01 | 3.5 | Male | No | Sun | Dinner | 3 |
23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
25.29 | 4.71 | Male | No | Sun | Dinner | 4 |
8.77 | 2 | Male | No | Sun | Dinner | 2 |
26.88 | 3.12 | Male | No | Sun | Dinner | 4 |
15.04 | 1.96 | Male | No | Sun | Dinner | 2 |
14.78 | 3.23 | Male | No | Sun | Dinner | 2 |
... (234 rows omitted)
One category is `'time'` – we can make separate histograms for every unique value in `'time'`. As a reminder, there are two unique times, `'Lunch'` and `'Dinner'`, so we should expect to see two histograms.
# One histogram of total bills for each value of 'time' (Lunch and Dinner).
tips.hist('total_bill', density=False, group='time')
/opt/conda/lib/python3.8/site-packages/datascience/tables.py:920: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray
# Same per-'time' histograms of total bills, now with the width-3 bin edges in bins_3.
tips.hist('total_bill', density=False, group='time', bins=bins_3)
If we want these on separate axes:
# overlay=False draws each 'time' group's histogram on its own axes instead of on shared ones.
tips.hist(
    'total_bill',
    density=False,
    group='time',
    overlay=False,
    width=700,
    height=500,
)
Note that for whatever reason, using `group`, `overlay`, and `bins` with an array all at the same time doesn't work. (I've raised the issue with the folks who maintain the `datascience` module.)
We could separate by other columns, like `'day'`.
# Re-display the table for reference before grouping the histogram by 'day'.
tips
total_bill | tip | sex | smoker | day | time | size |
---|---|---|---|---|---|---|
16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
21.01 | 3.5 | Male | No | Sun | Dinner | 3 |
23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
25.29 | 4.71 | Male | No | Sun | Dinner | 4 |
8.77 | 2 | Male | No | Sun | Dinner | 2 |
26.88 | 3.12 | Male | No | Sun | Dinner | 4 |
15.04 | 1.96 | Male | No | Sun | Dinner | 2 |
14.78 | 3.23 | Male | No | Sun | Dinner | 2 |
... (234 rows omitted)
# One histogram of total bills for each value of 'day'.
tips.hist(
    'total_bill',
    density=False,
    group='day',
    width=700,
    height=400,
)
There's too much going on there – but you can click the legend to hide certain days.
# Tip as a percentage of the total bill, computed element-wise for every row.
tip_pct = 100 * tips.column('tip') / tips.column('total_bill')
# Rebind tips to a table that also carries the new 'tip percentage' column.
tips = tips.with_columns('tip percentage', tip_pct)
tips
total_bill | tip | sex | smoker | day | time | size | tip percentage |
---|---|---|---|---|---|---|---|
16.99 | 1.01 | Female | No | Sun | Dinner | 2 | 5.94467 |
10.34 | 1.66 | Male | No | Sun | Dinner | 3 | 16.0542 |
21.01 | 3.5 | Male | No | Sun | Dinner | 3 | 16.6587 |
23.68 | 3.31 | Male | No | Sun | Dinner | 2 | 13.978 |
24.59 | 3.61 | Female | No | Sun | Dinner | 4 | 14.6808 |
25.29 | 4.71 | Male | No | Sun | Dinner | 4 | 18.624 |
8.77 | 2 | Male | No | Sun | Dinner | 2 | 22.805 |
26.88 | 3.12 | Male | No | Sun | Dinner | 4 | 11.6071 |
15.04 | 1.96 | Male | No | Sun | Dinner | 2 | 13.0319 |
14.78 | 3.23 | Male | No | Sun | Dinner | 2 | 21.8539 |
... (234 rows omitted)
# Keep only the rows whose tip percentage is below 25, then draw one histogram
# per value of 'time' using bin edges 0, 5, 10, 15, 20, 25.
(
    tips.where('tip percentage', are.below(25))
    .hist(
        'tip percentage',
        density=False,
        bins=np.arange(0, 30, 5),
        title="Distribution of Tip Percentages",
        xaxis_title="Tip Percentage",
        group='time',
    )
)
Run the following cell.
# IPython/Jupyter help syntax (not plain Python): the trailing ? shows the documentation for tips.hist.
tips.hist?