from datascience import *
import numpy as np
import seaborn as sns

# Turn on interactive plots for Table charts.
Table.interactive_plots()


# Fetch the `tips` dataset through seaborn and convert it into a Table.
tips = Table.from_df(
    sns.load_dataset('tips')
)
tips


# Review of bar charts: group the rows by `time`, then draw one
# horizontal bar per `time` value from the grouped table.
(
    tips
    .group('time')
    .barh('time')
)


# Group by `day` using np.mean as the aggregate, keep only the two
# mean columns of interest, then reorder the rows by position (row 3 first).
(
    tips
    .group('day', np.mean)
    .select('day', 'total_bill mean', 'tip mean')
    .take(3, 0, 1, 2)
)


# Same grouped/reordered table as above, drawn as a horizontal bar chart
# with explicit axis titles and a chart title.
(
    tips
    .group('day', np.mean)
    .select('day', 'total_bill mean', 'tip mean')
    .take(3, 0, 1, 2)
    .barh(
        'day',
        xaxis_title='Dollars',
        yaxis_title='Day of the Week',
        title='Tips and Total Bills on Various Days of the Week',
    )
)


# Show the table again for reference.
tips


# Histogram of the `tip` column; density=False asks for counts
# rather than a density scale.
tips.hist('tip', density=False)


# Aside: select the rows whose `tip` satisfies are.between(1.9, 2.8) ...
tips.where('tip', are.between(1.9, 2.8))


# ... and count them, to cross-check against the histogram.
(
    tips
    .where('tip', are.between(1.9, 2.8))
    .num_rows
)

79


# Same cross-check for the rows whose `tip` satisfies are.between(4.6, 5.5).
(
    tips
    .where('tip', are.between(4.6, 5.5))
    .num_rows
)

19


# The same histogram with the default `density` setting (density=False omitted).
tips.hist('tip')


# A one-column table of 19 height values, used to practise reading a
# histogram drawn with explicit bin edges.
numbers = Table().with_columns(
    'Height',
    np.array([72, 61, 63, 74, 68, 67, 65, 73, 65, 62, 66, 69, 75, 61, 61, 61, 65, 60, 64]),
)

numbers.hist('Height', density=False, bins=[60, 64, 68, 72, 76])


# Customization: set the x-axis title, chart title, and figure size.
tips.hist(
    'tip',
    density=False,
    xaxis_title='Tip (Dollars)',
    title='Distribution of Tips',
    width=600,
    height=600,
)


# Choosing bins: spell out the bin edges 0, 1, ..., 11 by hand.
tips.hist(
    'tip',
    density=False,
    bins=np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]),
)


# np.arange(12) yields the same edges 0 through 11 without typing them out.
tips.hist(
    'tip',
    density=False,
    bins=np.arange(12),
)


# Histogram of `total_bill` with the default bins, on a count scale.
tips.hist(
    'total_bill',
    density=False,
)


# Smallest value in `total_bill`; used below to pick where the bins start.
tips.column('total_bill').min()

3.07


# Largest value in `total_bill`; the custom bin edges defined below must reach past it.
tips.column('total_bill').max()

50.81


# Bin edges of width 3: starts at 3 and stops before 54, so the last edge is 51.
bins_3 = np.arange(start=3, stop=54, step=3)
bins_3

array([ 3,  6,  9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48, 51])


# Total bills on a count scale using the width-3 bin edges.
tips.hist(
    'total_bill',
    density=False,
    bins=bins_3,
    title='Distribution of Total Bills, Bin Width = 3',
    width=600,
    height=400,
)


# Bin edges of width 7: starts at 3 and stops before 53, so the last edge is 52.
bins_7 = np.arange(start=3, stop=53, step=7)
bins_7

array([ 3, 10, 17, 24, 31, 38, 45, 52])


# Total bills on a count scale using the width-7 bin edges.
tips.hist(
    'total_bill',
    density=False,
    bins=bins_7,
    title='Distribution of Total Bills, Bin Width = 7',
    width=600,
    height=400,
)


# Bin edges of width 10: starts at 3 and stops before 63, so the last edge is 53.
bins_10 = np.arange(start=3, stop=63, step=10)
bins_10

array([ 3, 13, 23, 33, 43, 53])


# Total bills on a count scale using the width-10 bin edges.
tips.hist(
    'total_bill',
    density=False,
    bins=bins_10,
    title='Distribution of Total Bills, Bin Width = 10',
    width=600,
    height=400,
)


# Show the table again before splitting it by a categorical column.
tips


# Histogram of `total_bill` split by the values of `time`.
# NOTE(review): the VisibleDeprecationWarning recorded right after this call
# points at datascience/tables.py, so it appears to originate inside the
# library rather than from these arguments -- confirm before changing the call.
tips.hist(
    'total_bill',
    density=False,
    group='time',
)

/opt/conda/lib/python3.8/site-packages/datascience/tables.py:920: VisibleDeprecationWarning:

Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray


# Split by `time` again, this time with the width-3 bin edges.
tips.hist(
    'total_bill',
    density=False,
    group='time',
    bins=bins_3,
)


# overlay=False: request separate histograms per `time` value instead of overlaid ones.
tips.hist(
    'total_bill',
    density=False,
    group='time',
    overlay=False,
    width=700,
    height=500,
)


tips


# Split by `day` instead of `time`.
tips.hist(
    'total_bill',
    density=False,
    group='day',
    width=700,
    height=400,
)


# Tip as a percentage of the total bill, computed element-wise over the
# two columns and appended to `tips` as the 'tip percentage' column.
tip_pct = 100 * tips.column('tip') / tips.column('total_bill')
tips = tips.with_columns('tip percentage', tip_pct)

tips


# Keep only rows with a tip percentage below 25, then histogram the
# percentages by `time` using bin edges 0, 5, ..., 25.
(
    tips
    .where('tip percentage', are.below(25))
    .hist(
        'tip percentage',
        density=False,
        bins=np.arange(0, 30, 5),
        title="Distribution of Tip Percentages",
        xaxis_title="Tip Percentage",
        group='time',
    )
)


# IPython/Jupyter help syntax: a trailing `?` shows the documentation for
# tips.hist. This line is not valid in a plain Python script.
tips.hist?

total_bill	tip	sex	smoker	day	time	size
16.99	1.01	Female	No	Sun	Dinner	2
10.34	1.66	Male	No	Sun	Dinner	3
21.01	3.5	Male	No	Sun	Dinner	3
23.68	3.31	Male	No	Sun	Dinner	2
24.59	3.61	Female	No	Sun	Dinner	4
25.29	4.71	Male	No	Sun	Dinner	4
8.77	2	Male	No	Sun	Dinner	2
26.88	3.12	Male	No	Sun	Dinner	4
15.04	1.96	Male	No	Sun	Dinner	2
14.78	3.23	Male	No	Sun	Dinner	2

day	total_bill mean	tip mean
Thur	17.6827	2.77145
Fri	17.1516	2.73474
Sat	20.4414	2.9931
Sun	21.41	3.25513

total_bill	tip	sex	smoker	day	time	size
16.99	1.01	Female	No	Sun	Dinner	2
10.34	1.66	Male	No	Sun	Dinner	3
21.01	3.5	Male	No	Sun	Dinner	3
23.68	3.31	Male	No	Sun	Dinner	2
24.59	3.61	Female	No	Sun	Dinner	4
25.29	4.71	Male	No	Sun	Dinner	4
8.77	2	Male	No	Sun	Dinner	2
26.88	3.12	Male	No	Sun	Dinner	4
15.04	1.96	Male	No	Sun	Dinner	2
14.78	3.23	Male	No	Sun	Dinner	2

total_bill	tip	sex	smoker	day	time	size
8.77	2	Male	No	Sun	Dinner	2
15.04	1.96	Male	No	Sun	Dinner	2
20.29	2.75	Female	No	Sat	Dinner	2
15.77	2.23	Female	No	Sat	Dinner	2
17.81	2.34	Male	No	Sat	Dinner	4
13.37	2	Male	No	Sat	Dinner	2
12.69	2	Male	No	Sat	Dinner	2
18.35	2.5	Male	No	Sat	Dinner	4
20.69	2.45	Female	No	Sat	Dinner	4
16.31	2	Male	No	Sat	Dinner	3

total_bill	tip	sex	smoker	day	time	size
16.99	1.01	Female	No	Sun	Dinner	2
10.34	1.66	Male	No	Sun	Dinner	3
21.01	3.5	Male	No	Sun	Dinner	3
23.68	3.31	Male	No	Sun	Dinner	2
24.59	3.61	Female	No	Sun	Dinner	4
25.29	4.71	Male	No	Sun	Dinner	4
8.77	2	Male	No	Sun	Dinner	2
26.88	3.12	Male	No	Sun	Dinner	4
15.04	1.96	Male	No	Sun	Dinner	2
14.78	3.23	Male	No	Sun	Dinner	2

Lecture 25 – Visualizing Numerical Variables

Data 6, Summer 2021

Review: bar charts

Histograms

Aside: can confirm results using `where` and `are.between`

Why do we need `density = False`?

Quick Check 1

Customization

Choosing bins

`np.arange`, revisited

Overlaid and side-by-side histograms

Quick Check 2

Documentation

Lecture 25 – Visualizing Numerical Variables

Data 6, Summer 2021

Review: bar charts

Histograms

Aside: can confirm results using where and are.between

Why do we need density = False?

Quick Check 1

Customization

Choosing bins

np.arange, revisited

Overlaid and side-by-side histograms

Quick Check 2

Documentation

Aside: can confirm results using `where` and `are.between`

Why do we need `density = False`?

`np.arange`, revisited