from datascience import *
import numpy as np
Table.interactive_plots()
import plotly.express as px
sky = Table.read_table('data/skyscrapers.csv') \
.where('status.current', are.contained_in(['completed', 'under construction'])) \
.select('name', 'location.city', 'location.latitude', 'location.longitude',
'statistics.floors above', 'statistics.height', 'status.completed.year') \
.relabeled(['location.city', 'location.latitude', 'location.longitude',
'statistics.floors above', 'statistics.height', 'status.completed.year'],
['city', 'latitude', 'longitude', 'floors', 'height', 'year']) \
.where('height', are.above(0)) \
.where('floors', are.above(0))
sky
name | city | latitude | longitude | floors | height | year |
---|---|---|---|---|---|---|
One World Trade Center | New York City | 40.7131 | -74.0134 | 94 | 541.3 | 2014 |
Central Park Tower | New York City | 40.7664 | -73.9809 | 95 | 541.02 | 2019 |
Willis Tower | Chicago | 41.8789 | -87.6359 | 108 | 442.14 | 1974 |
111 West 57th Street | New York City | 40.7648 | -73.9775 | 80 | 438.3 | 2018 |
432 Park Avenue | New York City | 40.7616 | -73.9719 | 85 | 425.5 | 2015 |
Trump International Hotel & Tower | Chicago | 41.8889 | -87.6264 | 98 | 423.22 | 2009 |
30 Hudson Yards | New York City | 40.754 | -74.0008 | 73 | 386.61 | 2019 |
Empire State Building | New York City | 40.7484 | -73.9856 | 102 | 381 | 1931 |
Bank of America Tower | New York City | 40.7554 | -73.9844 | 55 | 365.8 | 2009 |
Aon Center | Chicago | 41.8852 | -87.6215 | 83 | 346.26 | 1973 |
... (2145 rows omitted)
sky.group('city') \
.where('count', are.above_or_equal_to(40)) \
.sort('count', descending = True) \
.barh('city', title = 'Number of Skyscrapers Per City')
# Remember, you're not responsible for the code here.
px.pie(sky.group('city').where('count', are.above_or_equal_to(40)).to_df(),
values = 'count',
names = 'city',
title = 'Number of Skyscrapers Per City (Top 10 Only)'
)
sky.shuffle()
name | city | latitude | longitude | floors | height | year |
---|---|---|---|---|---|---|
City Centre | Seattle | 47.6106 | -122.335 | 44 | 184.82 | 1989 |
PNC Tower | Louisville | 38.2562 | -85.7579 | 40 | 156.06 | 1972 |
Park Millennium | Chicago | 41.8866 | -87.6214 | 57 | 165.88 | 2002 |
420 West 42nd Street | New York City | 40.7585 | -73.9939 | 41 | 121.9 | 2001 |
Avalon on the Sound East | New Rochelle | 40.9095 | -73.7838 | 39 | 114.3 | 2008 |
Continental Life Building | St. Louis | 38.6385 | -90.2325 | 22 | 87.1 | 1930 |
Tribune Tower | Chicago | 41.8904 | -87.6236 | 34 | 141.12 | 1925 |
Sullivan Center | Chicago | 41.8817 | -87.6274 | 12 | 63.12 | 1903 |
3630 Peachtree Road | Atlanta | 33.8545 | -84.3586 | 40 | 142.95 | 2009 |
One Kansas City Place | Kansas City | 39.0997 | -94.5838 | 42 | 189.89 | 1988 |
... (2145 rows omitted)
sky.group('city') \
.where('count', are.above_or_equal_to(20)) \
.sort('count', descending = True)
city | count |
---|---|
New York City | 558 |
Chicago | 376 |
Miami | 107 |
San Francisco | 65 |
Houston | 61 |
Seattle | 60 |
Honolulu | 56 |
Los Angeles | 51 |
Las Vegas | 43 |
Minneapolis | 39 |
... (9 rows omitted)
sky.group('city') \
.where('count', are.above_or_equal_to(20)) \
.sort('count', descending = True) \
.barh('city', title = 'Number of Skyscrapers Per City (Min. 20)')
Do any of the above cities stick out to you?
sky.column('height').min()
35.970001220703
sky.column('height').max()
541.29998779297
sky.hist('height', density = False, bins = np.arange(0, 600, 25),
title = 'Distribution of Skyscraper Heights')
Let's zoom in a little more.
sky.where('height', are.below(300)) \
.hist('height', density = False, bins = np.arange(0, 310, 10),
title = 'Distribution of Skyscraper Heights Below 300m')
sky
name | city | latitude | longitude | floors | height | year |
---|---|---|---|---|---|---|
One World Trade Center | New York City | 40.7131 | -74.0134 | 94 | 541.3 | 2014 |
Central Park Tower | New York City | 40.7664 | -73.9809 | 95 | 541.02 | 2019 |
Willis Tower | Chicago | 41.8789 | -87.6359 | 108 | 442.14 | 1974 |
111 West 57th Street | New York City | 40.7648 | -73.9775 | 80 | 438.3 | 2018 |
432 Park Avenue | New York City | 40.7616 | -73.9719 | 85 | 425.5 | 2015 |
Trump International Hotel & Tower | Chicago | 41.8889 | -87.6264 | 98 | 423.22 | 2009 |
30 Hudson Yards | New York City | 40.754 | -74.0008 | 73 | 386.61 | 2019 |
Empire State Building | New York City | 40.7484 | -73.9856 | 102 | 381 | 1931 |
Bank of America Tower | New York City | 40.7554 | -73.9844 | 55 | 365.8 | 2009 |
Aon Center | Chicago | 41.8852 | -87.6215 | 83 | 346.26 | 1973 |
... (2145 rows omitted)
Let's say a skyscraper is "short" if its height is less than or equal to 150 meters; otherwise, it's "tall".
def height_cat(height):
if height <= 150:
return 'short'
return 'tall'
sky.apply(height_cat, 'height')
array(['tall', 'tall', 'tall', ..., 'short', 'short', 'short'], dtype='<U5')
sky = sky.with_columns('height category', sky.apply(height_cat, 'height'))
sky
name | city | latitude | longitude | floors | height | year | height category |
---|---|---|---|---|---|---|---|
One World Trade Center | New York City | 40.7131 | -74.0134 | 94 | 541.3 | 2014 | tall |
Central Park Tower | New York City | 40.7664 | -73.9809 | 95 | 541.02 | 2019 | tall |
Willis Tower | Chicago | 41.8789 | -87.6359 | 108 | 442.14 | 1974 | tall |
111 West 57th Street | New York City | 40.7648 | -73.9775 | 80 | 438.3 | 2018 | tall |
432 Park Avenue | New York City | 40.7616 | -73.9719 | 85 | 425.5 | 2015 | tall |
Trump International Hotel & Tower | Chicago | 41.8889 | -87.6264 | 98 | 423.22 | 2009 | tall |
30 Hudson Yards | New York City | 40.754 | -74.0008 | 73 | 386.61 | 2019 | tall |
Empire State Building | New York City | 40.7484 | -73.9856 | 102 | 381 | 1931 | tall |
Bank of America Tower | New York City | 40.7554 | -73.9844 | 55 | 365.8 | 2009 | tall |
Aon Center | Chicago | 41.8852 | -87.6215 | 83 | 346.26 | 1973 | tall |
... (2145 rows omitted)
We can use pivot
to draw a bar chart of the number of short and tall skyscrapers per city.
Fill in the blanks to create the table short_and_tall
, which has two columns, 'short'
and 'tall'
, and one row for each city with at least 5 short and 5 tall skyscrapers. The first five rows of short_and_tall
are shown below.
city | short | tall |
---|---|---|
New York City | 341 | 217 |
Chicago | 268 | 108 |
Miami | 58 | 49 |
Houston | 34 | 27 |
San Francisco | 43 | 22 |
short_and_tall = sky.pivot(__(a)__, __(b)__) \
.where(__(c)__, are.above_or_equal_to(5)) \
.where('tall', are.above_or_equal_to(5)) \
.sort('tall', descending = True)
short_and_tall = sky.pivot('height category', 'city') \
.where('short', are.above_or_equal_to(5)) \
.where('tall', are.above_or_equal_to(5)) \
.sort('tall', descending = True)
/opt/conda/lib/python3.8/site-packages/datascience/tables.py:920: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray
short_and_tall.barh('city', title = 'Number of Short and Tall Skyscrapers Per City (Min. 5 Each)')
It seems like most cities have roughly twice as many "short" skyscrapers as they do "tall" skyscrapers.
What if we want to look at the distribution of the number of floors per skyscraper, separated by height category?
sky.hist('floors', group = 'height category',
density = False,
bins = np.arange(0, 150, 5),
title = 'Distribution of Number of Floors Per Skyscraper')
Since there is overlap between the two histograms, we have that there are some short skyscrapers (below 150m) with more floors than some tall skyscrapers!
sky
name | city | latitude | longitude | floors | height | year | height category |
---|---|---|---|---|---|---|---|
One World Trade Center | New York City | 40.7131 | -74.0134 | 94 | 541.3 | 2014 | tall |
Central Park Tower | New York City | 40.7664 | -73.9809 | 95 | 541.02 | 2019 | tall |
Willis Tower | Chicago | 41.8789 | -87.6359 | 108 | 442.14 | 1974 | tall |
111 West 57th Street | New York City | 40.7648 | -73.9775 | 80 | 438.3 | 2018 | tall |
432 Park Avenue | New York City | 40.7616 | -73.9719 | 85 | 425.5 | 2015 | tall |
Trump International Hotel & Tower | Chicago | 41.8889 | -87.6264 | 98 | 423.22 | 2009 | tall |
30 Hudson Yards | New York City | 40.754 | -74.0008 | 73 | 386.61 | 2019 | tall |
Empire State Building | New York City | 40.7484 | -73.9856 | 102 | 381 | 1931 | tall |
Bank of America Tower | New York City | 40.7554 | -73.9844 | 55 | 365.8 | 2009 | tall |
Aon Center | Chicago | 41.8852 | -87.6215 | 83 | 346.26 | 1973 | tall |
... (2145 rows omitted)
sky.scatter('height', 'floors',
s = 30,
group = 'height category',
title = 'Number of Floors vs. Height',
yaxis_title = 'Number of Floors')
sky.where('height', are.above(300)) \
.scatter('height', 'floors',
s = 50,
labels = 'name',
title = 'Number of Floors vs. Height (Min. 300m)')
sky
name | city | latitude | longitude | floors | height | year | height category |
---|---|---|---|---|---|---|---|
One World Trade Center | New York City | 40.7131 | -74.0134 | 94 | 541.3 | 2014 | tall |
Central Park Tower | New York City | 40.7664 | -73.9809 | 95 | 541.02 | 2019 | tall |
Willis Tower | Chicago | 41.8789 | -87.6359 | 108 | 442.14 | 1974 | tall |
111 West 57th Street | New York City | 40.7648 | -73.9775 | 80 | 438.3 | 2018 | tall |
432 Park Avenue | New York City | 40.7616 | -73.9719 | 85 | 425.5 | 2015 | tall |
Trump International Hotel & Tower | Chicago | 41.8889 | -87.6264 | 98 | 423.22 | 2009 | tall |
30 Hudson Yards | New York City | 40.754 | -74.0008 | 73 | 386.61 | 2019 | tall |
Empire State Building | New York City | 40.7484 | -73.9856 | 102 | 381 | 1931 | tall |
Bank of America Tower | New York City | 40.7554 | -73.9844 | 55 | 365.8 | 2009 | tall |
Aon Center | Chicago | 41.8852 | -87.6215 | 83 | 346.26 | 1973 | tall |
... (2145 rows omitted)
sky.group('year')
year | count |
---|---|
0 | 7 |
1861 | 1 |
1888 | 1 |
1889 | 1 |
1892 | 1 |
1893 | 1 |
1895 | 2 |
1896 | 2 |
1897 | 1 |
1898 | 1 |
... (111 rows omitted)
This is obviously an error in our data.
sky.where('year', 0)
name | city | latitude | longitude | floors | height | year | height category |
---|---|---|---|---|---|---|---|
Miami River | Miami | 0 | 0 | 57 | 192.03 | 0 | tall |
461 Dean Street | New York City | 40.682 | -73.9755 | 32 | 105.77 | 0 | short |
640 North Wells Street | Chicago | 41.8936 | -87.6344 | 22 | 85.34 | 0 | short |
SkyHouse Channelside | Tampa | 27.9479 | -82.4469 | 23 | 83.82 | 0 | short |
SkyHouse Dallas | Dallas | 0 | 0 | 24 | 80 | 0 | short |
2950 North Sheridan Road | Chicago | 41.9359 | -87.6398 | 19 | 65.2 | 0 | short |
Anthem Blue Cross | Los Angeles | 34.1806 | -118.599 | 13 | 57.91 | 0 | short |
sky.where('year', are.not_equal_to(0)) \
.group('year') \
.plot('year', title = 'Number of Skyscrapers Built Per Year')
What if we want to look at the number of skyscrapers per year built in different cities?
sky.where('city', are.contained_in(['New York City', 'Chicago'])) \
.where('year', are.not_equal_to(0)) \
.pivot('city', 'year')
year | Chicago | New York City |
---|---|---|
1888 | 1 | 0 |
1889 | 1 | 0 |
1893 | 1 | 0 |
1895 | 1 | 1 |
1896 | 0 | 1 |
1899 | 1 | 2 |
1902 | 0 | 2 |
1903 | 1 | 2 |
1904 | 1 | 0 |
1905 | 0 | 1 |
... (100 rows omitted)
sky.where('city', are.contained_in(['New York City', 'Chicago'])) \
.where('year', are.not_equal_to(0)) \
.pivot('city', 'year') \
.plot('year',
title = 'Number of Skyscrapers Built Per Year in NYC and Chicago')
sky
name | city | latitude | longitude | floors | height | year | height category |
---|---|---|---|---|---|---|---|
One World Trade Center | New York City | 40.7131 | -74.0134 | 94 | 541.3 | 2014 | tall |
Central Park Tower | New York City | 40.7664 | -73.9809 | 95 | 541.02 | 2019 | tall |
Willis Tower | Chicago | 41.8789 | -87.6359 | 108 | 442.14 | 1974 | tall |
111 West 57th Street | New York City | 40.7648 | -73.9775 | 80 | 438.3 | 2018 | tall |
432 Park Avenue | New York City | 40.7616 | -73.9719 | 85 | 425.5 | 2015 | tall |
Trump International Hotel & Tower | Chicago | 41.8889 | -87.6264 | 98 | 423.22 | 2009 | tall |
30 Hudson Yards | New York City | 40.754 | -74.0008 | 73 | 386.61 | 2019 | tall |
Empire State Building | New York City | 40.7484 | -73.9856 | 102 | 381 | 1931 | tall |
Bank of America Tower | New York City | 40.7554 | -73.9844 | 55 | 365.8 | 2009 | tall |
Aon Center | Chicago | 41.8852 | -87.6215 | 83 | 346.26 | 1973 | tall |
... (2145 rows omitted)
Circle.map_table(sky.select('latitude', 'longitude'),
line_color = None,
fill_opacity = 0.65,
area = 75,
color = 'orange')
Let's look at a map of tall skyscrapers in New York City.
ny_tall = sky.where('city', 'New York City') \
.where('height category', 'tall') \
.select('latitude', 'longitude', 'name', 'height') \
.relabeled(['name', 'height'], ['labels', 'color_scale'])
ny_tall
latitude | longitude | labels | color_scale |
---|---|---|---|
40.7131 | -74.0134 | One World Trade Center | 541.3 |
40.7664 | -73.9809 | Central Park Tower | 541.02 |
40.7648 | -73.9775 | 111 West 57th Street | 438.3 |
40.7616 | -73.9719 | 432 Park Avenue | 425.5 |
40.754 | -74.0008 | 30 Hudson Yards | 386.61 |
40.7484 | -73.9856 | Empire State Building | 381 |
40.7554 | -73.9844 | Bank of America Tower | 365.8 |
40.7109 | -74.0116 | 3 World Trade Center | 328.88 |
40.7618 | -73.9782 | 53 West 53rd | 320.04 |
40.7516 | -73.9753 | Chrysler Building | 318.9 |
... (207 rows omitted)
Circle.map_table(ny_tall,
line_color = None,
fill_opacity = 0.65,
area = 150,
color_scale = None)
It seems like most skyscrapers in NYC are either in the financial district or in Midtown. The circles for One World Trade Center and the Empire State Building are bright.
Lastly, what if we want to look at where short and tall skyscrapers are throughout the country?
sky
name | city | latitude | longitude | floors | height | year | height category |
---|---|---|---|---|---|---|---|
One World Trade Center | New York City | 40.7131 | -74.0134 | 94 | 541.3 | 2014 | tall |
Central Park Tower | New York City | 40.7664 | -73.9809 | 95 | 541.02 | 2019 | tall |
Willis Tower | Chicago | 41.8789 | -87.6359 | 108 | 442.14 | 1974 | tall |
111 West 57th Street | New York City | 40.7648 | -73.9775 | 80 | 438.3 | 2018 | tall |
432 Park Avenue | New York City | 40.7616 | -73.9719 | 85 | 425.5 | 2015 | tall |
Trump International Hotel & Tower | Chicago | 41.8889 | -87.6264 | 98 | 423.22 | 2009 | tall |
30 Hudson Yards | New York City | 40.754 | -74.0008 | 73 | 386.61 | 2019 | tall |
Empire State Building | New York City | 40.7484 | -73.9856 | 102 | 381 | 1931 | tall |
Bank of America Tower | New York City | 40.7554 | -73.9844 | 55 | 365.8 | 2009 | tall |
Aon Center | Chicago | 41.8852 | -87.6215 | 83 | 346.26 | 1973 | tall |
... (2145 rows omitted)
There are two solutions here.
'short'
or 'tall'
and returns the desired color. (We did this in Lecture 28.)'short'
and 'tall'
and the other with the desired colors, and join this table with sky
.We will use the second approach here.
sky_to_color = Table().with_columns(
'category', np.array(['short', 'tall']),
'colors', np.array(['orange', 'green'])
)
sky_to_color
category | colors |
---|---|
short | orange |
tall | green |
sky_with_colors = sky.join('height category', sky_to_color, 'category') \
.select('latitude', 'longitude', 'colors')
sky_with_colors
latitude | longitude | colors |
---|---|---|
40.7624 | -73.9718 | orange |
25.8459 | -80.1197 | orange |
39.9516 | -75.1602 | orange |
37.7918 | -122.396 | orange |
44.9736 | -93.2763 | orange |
40.7792 | -73.9878 | orange |
25.7628 | -80.1926 | orange |
40.7439 | -73.9926 | orange |
33.765 | -84.3881 | orange |
40.7612 | -73.9788 | orange |
... (2145 rows omitted)
Circle.map_table(sky_with_colors,
line_color = None,
fill_opacity = 0.7)
While there seem to be short skyscrapers (orange) throughout the country, tall skyscrapers generally seem to be concentrated in larger cities.