# Importing all of our tools
import numpy as np
from datascience import *
from math import *
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import plotly.express as px
Table.interactive_plots()
Run the next cell to import all of our datasets, obtained from https://www.electionreturns.pa.gov/ and https://www.truckads.com/.
pa_2020 = Table().read_table("data/2020_presidential.CSV")
pa_2020.show(5)
Election Name | County Name | Office Name | District Name | Party Name | Candidate Name | Votes | Yes Votes | No Votes | Election Day Votes | ElectionDay Yes Votes | Election Day No Votes | Mail Votes | Mail Yes Votes | Mail No Votes | Provisional Votes | Provisional Yes Votes | Provisional No Votes |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2020 Presidential Election | ADAMS | President of the United States | Statewide | Democratic | BIDEN, JOSEPH ROBINETTE JR | 18,207 | 0 | 0 | 6,611 | 0 | 0 | 11,436 | 0 | 0 | 160 | 0 | 0 |
2020 Presidential Election | ADAMS | President of the United States | Statewide | Republican | TRUMP, DONALD J. | 37,523 | 0 | 0 | 29,942 | 0 | 0 | 7,148 | 0 | 0 | 433 | 0 | 0 |
2020 Presidential Election | ADAMS | President of the United States | Statewide | Libertarian | JORGENSEN, JO | 810 | 0 | 0 | 544 | 0 | 0 | 256 | 0 | 0 | 10 | 0 | 0 |
2020 Presidential Election | ALLEGHENY | President of the United States | Statewide | Democratic | BIDEN, JOSEPH ROBINETTE JR | 429,065 | 0 | 0 | 148,171 | 0 | 0 | 273,080 | 0 | 0 | 7,814 | 0 | 0 |
2020 Presidential Election | ALLEGHENY | President of the United States | Statewide | Republican | TRUMP, DONALD J. | 282,324 | 0 | 0 | 209,459 | 0 | 0 | 66,575 | 0 | 0 | 6,290 | 0 | 0 |
... (196 rows omitted)
To make this easier, let's cut it down to the columns we want. (For this section, don't re-run cells! You'll get an error.)
cols_of_interest = ["County Name", "Party Name", "Candidate Name", "Votes", "Election Day Votes", "Mail Votes", "Provisional Votes"]
pa_2020 = pa_2020.select(cols_of_interest)
pa_2020
County Name | Party Name | Candidate Name | Votes | Election Day Votes | Mail Votes | Provisional Votes |
---|---|---|---|---|---|---|
ADAMS | Democratic | BIDEN, JOSEPH ROBINETTE JR | 18,207 | 6,611 | 11,436 | 160 |
ADAMS | Republican | TRUMP, DONALD J. | 37,523 | 29,942 | 7,148 | 433 |
ADAMS | Libertarian | JORGENSEN, JO | 810 | 544 | 256 | 10 |
ALLEGHENY | Democratic | BIDEN, JOSEPH ROBINETTE JR | 429,065 | 148,171 | 273,080 | 7,814 |
ALLEGHENY | Republican | TRUMP, DONALD J. | 282,324 | 209,459 | 66,575 | 6,290 |
ALLEGHENY | Libertarian | JORGENSEN, JO | 8,344 | 5,085 | 3,076 | 183 |
ARMSTRONG | Democratic | BIDEN, JOSEPH ROBINETTE JR | 8,457 | 4,275 | 4,108 | 74 |
ARMSTRONG | Republican | TRUMP, DONALD J. | 27,489 | 24,233 | 2,917 | 339 |
ARMSTRONG | Libertarian | JORGENSEN, JO | 424 | 340 | 78 | 6 |
BEAVER | Democratic | BIDEN, JOSEPH ROBINETTE JR | 38,122 | 17,304 | 20,153 | 665 |
... (191 rows omitted)
# What was the voter turn out (# of voters who voted) in 2020? Try calculating it.
total_votes = sum(pa_2020.column("Votes"))
total_votes
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) /tmp/ipykernel_112/1346453950.py in <module> 1 # What was the voter turn out (# of voters who voted) in 2020? Try calculating it. ----> 2 total_votes = sum(pa_2020.column("Votes")) 3 total_votes TypeError: unsupported operand type(s) for +: 'int' and 'numpy.str_'
# What's the problem? Let's check the data types.
pa_2020.column("Votes").item(0)
'18,207'
# It looks like we need to do some data cleaning. How do we convert this into a number we can work with?
adamsctybiden = pa_2020.column("Votes").item(0)
# strings are a specific object with methods associated
# remember we can "cast" a datatype to a int
int(adamsctybiden.replace(",", ""))
18207
# Now, how do we do this to all of the numbers in the dataset? We need a function.
def vote_to_int(votes):
"""Given a string of votes, convert to an integer"""
return int(votes.replace(",", ""))
vote_to_int(adamsctybiden)
18207
# To do a lot of conversions, we can apply this to the data in the table!
votes_as_int = pa_2020.apply(vote_to_int, "Votes")
sum(votes_as_int) # Now we can do math!
6915283
# NOTE: We can do the same thing with a for loop:
looped_ver = np.array([])
for vote in pa_2020.column("Votes"):
looped_ver = np.append(looped_ver, vote_to_int(vote))
looped_ver
array([1.82070e+04, 3.75230e+04, 8.10000e+02, 4.29065e+05, 2.82324e+05, 8.34400e+03, 8.45700e+03, 2.74890e+04, 4.24000e+02, 3.81220e+04, 5.47590e+04, 1.24100e+03, 4.36700e+03, 2.30250e+04, 1.82000e+02, 9.28950e+04, 1.09736e+05, 2.90900e+03, 1.76360e+04, 4.53060e+04, 6.53000e+02, 8.04600e+03, 2.16000e+04, 5.13000e+02, 2.04712e+05, 1.87367e+05, 4.15500e+03, 3.75080e+04, 7.43590e+04, 1.43800e+03, 2.17300e+04, 4.80850e+04, 7.59000e+02, 6.34000e+02, 1.77100e+03, 2.90000e+01, 1.12120e+04, 2.19840e+04, 4.33000e+02, 4.00550e+04, 3.63720e+04, 1.06600e+03, 1.82372e+05, 1.28565e+05, 3.56500e+03, 4.67800e+03, 1.45780e+04, 2.37000e+02, 9.67300e+03, 2.92030e+04, 5.46000e+02, 5.50200e+03, 1.19020e+04, 2.21000e+02, 1.05320e+04, 2.00980e+04, 5.41000e+02, 1.29240e+04, 2.85590e+04, 5.21000e+02, 6.22450e+04, 7.72120e+04, 2.13800e+03, 7.89830e+04, 6.64080e+04, 1.97700e+03, 2.06423e+05, 1.18532e+05, 2.97600e+03, 4.52200e+03, 1.21400e+04, 2.44000e+02, 6.82860e+04, 6.68690e+04, 1.92800e+03, 2.04440e+04, 4.12270e+04, 4.68000e+02, 7.28000e+02, 1.88200e+03, 3.60000e+01, 2.24220e+04, 5.72450e+04, 1.11600e+03, 1.08500e+03, 6.82400e+03, 6.80000e+01, 4.91100e+03, 1.25790e+04, 1.79000e+02, 5.44500e+03, 1.70610e+04, 2.86000e+02, 1.26340e+04, 2.80890e+04, 4.75000e+02, 4.52700e+03, 1.79600e+04, 3.37000e+02, 2.25300e+03, 9.64900e+03, 1.41000e+02, 6.19910e+04, 5.23340e+04, 1.08500e+03, 1.15847e+05, 1.60209e+05, 4.18300e+03, 1.59780e+04, 2.95970e+04, 5.01000e+02, 2.39320e+04, 4.67310e+04, 9.89000e+02, 9.82880e+04, 8.42590e+04, 2.16600e+03, 6.48730e+04, 8.69290e+04, 1.51900e+03, 1.69710e+04, 4.14620e+04, 8.21000e+02, 5.09800e+03, 1.40830e+04, 2.85000e+02, 2.10670e+04, 3.61430e+04, 7.44000e+02, 4.60300e+03, 1.66700e+04, 2.29000e+02, 4.40600e+04, 3.87260e+04, 1.04300e+03, 3.19511e+05, 1.85460e+05, 5.18600e+03, 3.77100e+03, 5.84400e+03, 1.56000e+02, 8.50870e+04, 8.38540e+04, 2.00100e+03, 1.26770e+04, 2.89520e+04, 6.54000e+02, 5.95000e+03, 1.82930e+04, 4.09000e+02, 6.03790e+05, 1.32740e+05, 4.84700e+03, 1.30190e+04, 1.92130e+04, 3.22000e+02, 1.72600e+03, 7.23900e+03, 9.90000e+01, 2.07270e+04, 4.88710e+04, 1.00500e+03, 4.91000e+03, 1.39830e+04, 2.47000e+02, 8.65400e+03, 3.14660e+04, 4.23000e+02, 9.21000e+02, 2.61900e+03, 5.50000e+01, 6.23600e+03, 1.52070e+04, 3.09000e+02, 4.95500e+03, 1.57420e+04, 3.78000e+02, 7.47500e+03, 1.23560e+04, 2.84000e+02, 7.58500e+03, 1.85690e+04, 3.74000e+02, 6.06600e+03, 1.42370e+04, 3.47000e+02, 4.50880e+04, 7.20800e+04, 1.31000e+03, 9.19100e+03, 1.86370e+04, 2.61000e+02, 7.21290e+04, 1.30218e+05, 2.35000e+03, 4.70400e+03, 9.93600e+03, 2.18000e+02, 8.81140e+04, 1.46733e+05, 3.62400e+03])
# Now, we need to convert all string data into integer data
cols_to_convert = ["Votes", "Election Day Votes", "Mail Votes", "Provisional Votes"]
for col in cols_to_convert:
votes_as_ints = pa_2020.apply(vote_to_int, col)
pa_2020 = pa_2020.with_column(col, votes_as_ints)
pa_2020
County Name | Party Name | Candidate Name | Votes | Election Day Votes | Mail Votes | Provisional Votes |
---|---|---|---|---|---|---|
ADAMS | Democratic | BIDEN, JOSEPH ROBINETTE JR | 18207 | 6611 | 11436 | 160 |
ADAMS | Republican | TRUMP, DONALD J. | 37523 | 29942 | 7148 | 433 |
ADAMS | Libertarian | JORGENSEN, JO | 810 | 544 | 256 | 10 |
ALLEGHENY | Democratic | BIDEN, JOSEPH ROBINETTE JR | 429065 | 148171 | 273080 | 7814 |
ALLEGHENY | Republican | TRUMP, DONALD J. | 282324 | 209459 | 66575 | 6290 |
ALLEGHENY | Libertarian | JORGENSEN, JO | 8344 | 5085 | 3076 | 183 |
ARMSTRONG | Democratic | BIDEN, JOSEPH ROBINETTE JR | 8457 | 4275 | 4108 | 74 |
ARMSTRONG | Republican | TRUMP, DONALD J. | 27489 | 24233 | 2917 | 339 |
ARMSTRONG | Libertarian | JORGENSEN, JO | 424 | 340 | 78 | 6 |
BEAVER | Democratic | BIDEN, JOSEPH ROBINETTE JR | 38122 | 17304 | 20153 | 665 |
... (191 rows omitted)
# Checking data types again, just in case
type(pa_2020.column("Mail Votes").item(0))
int
# Looks good! Now, as someone interested in media, let's add in some info about TV media markets
pa_media_markets = Table().read_table("data/media_markets.csv")
pa_media_markets.show(5)
Media Market | County |
---|---|
Erie | Crawford |
Erie | Erie |
Erie | McKean |
Erie | Warren |
Harrisburg Lancaster Lebanon York | Adams |
... (62 rows omitted)
# I want this information attached to pa_2020
# We'll learn this tomorrow: we can "join" multiple tables by a specific column
# One issue: the data needs to be the same across tables to work,
# so the upper case vs capitalized counties will give us an error
mm_counties = pa_media_markets.column("County")
cleaned_counties = make_array()
for county in mm_counties:
clean_county = county.upper() # Converting counties into upper case to make this work
cleaned_counties = np.append(cleaned_counties, clean_county)
cleaned_counties
array(['CRAWFORD', 'ERIE', 'MCKEAN', 'WARREN', 'ADAMS', 'CUMBERLAND', 'DAUPHIN', 'FRANKLIN', 'JUNIATA', 'LANCASTER', 'LEBANON', 'PERRY', 'YORK', 'BEDFORD', 'BLAIR', 'CAMBRIA', 'CAMERON', 'CENTRE', 'CLEARFIELD', 'ELK', 'FULTON', 'HUNTINGDON', 'JEFFERSON', 'MIFFLIN', 'SOMERSET', 'BERKS', 'BUCKS', 'CHESTER', 'DELAWARE', 'LEHIGH', 'MONTGOMERY', 'NORTHAMPTON', 'PHILADELPHIA', 'ALLEGHENY', 'ARMSTRONG', 'BEAVER', 'BUTLER', 'CLARION', 'FAYETTE', 'FOREST', 'GREENE', 'INDIANA', 'LAWRENCE', 'MERCER', 'VENANGO', 'WASHINGTON', 'WESTMORELAND', 'BRADFORD', 'CARBON', 'CLINTON', 'COLUMBIA', 'LACKAWANNA', 'LUZERNE', 'LYCOMING', 'MONROE', 'MONTOUR', 'NORTHUMBERLAND', 'PIKE', 'POTTER', 'SCHUYLKILL', 'SNYDER', 'SULLIVAN', 'SUSQUEHANNA', 'TIOGA', 'UNION', 'WAYNE', 'WYOMING'], dtype='<U32')
# Cleaning up the media markets table with the new names
pa_media_markets = pa_media_markets.with_column("County", cleaned_counties)
pa_media_markets
Media Market | County |
---|---|
Erie | CRAWFORD |
Erie | ERIE |
Erie | MCKEAN |
Erie | WARREN |
Harrisburg Lancaster Lebanon York | ADAMS |
Harrisburg Lancaster Lebanon York | CUMBERLAND |
Harrisburg Lancaster Lebanon York | DAUPHIN |
Harrisburg Lancaster Lebanon York | FRANKLIN |
Harrisburg Lancaster Lebanon York | JUNIATA |
Harrisburg Lancaster Lebanon York | LANCASTER |
... (57 rows omitted)
# Finally, combining all of our data with a join
pa_2020_mm = pa_2020.join("County Name", pa_media_markets, "County")
pa_2020_mm
County Name | Party Name | Candidate Name | Votes | Election Day Votes | Mail Votes | Provisional Votes | Media Market |
---|---|---|---|---|---|---|---|
ADAMS | Democratic | BIDEN, JOSEPH ROBINETTE JR | 18207 | 6611 | 11436 | 160 | Harrisburg Lancaster Lebanon York |
ADAMS | Republican | TRUMP, DONALD J. | 37523 | 29942 | 7148 | 433 | Harrisburg Lancaster Lebanon York |
ADAMS | Libertarian | JORGENSEN, JO | 810 | 544 | 256 | 10 | Harrisburg Lancaster Lebanon York |
ALLEGHENY | Democratic | BIDEN, JOSEPH ROBINETTE JR | 429065 | 148171 | 273080 | 7814 | Pittsburgh |
ALLEGHENY | Republican | TRUMP, DONALD J. | 282324 | 209459 | 66575 | 6290 | Pittsburgh |
ALLEGHENY | Libertarian | JORGENSEN, JO | 8344 | 5085 | 3076 | 183 | Pittsburgh |
ARMSTRONG | Democratic | BIDEN, JOSEPH ROBINETTE JR | 8457 | 4275 | 4108 | 74 | Pittsburgh |
ARMSTRONG | Republican | TRUMP, DONALD J. | 27489 | 24233 | 2917 | 339 | Pittsburgh |
ARMSTRONG | Libertarian | JORGENSEN, JO | 424 | 340 | 78 | 6 | Pittsburgh |
BEAVER | Democratic | BIDEN, JOSEPH ROBINETTE JR | 38122 | 17304 | 20153 | 665 | Pittsburgh |
... (188 rows omitted)
Our questions for this analysis:
How did each party vote? Remember that the GOP was very against mail-in-ballots in 2020.
What media markets supported each party in 2020? Pennsylvania is going through a demographic and economic shift, so this may have insights for 2022 or 2024.
Let's learn more about where voters voted and how they voted. We're going to use group
and pivot
for this.
Recall: tbl.group("col", func)
If func is not specified, by default finds the count of each unique value in "col". Otherwise, applies func to the grouped values in every other column.
tbl.pivot("col", "row", "vals", func)
cross-classifies a dataset, making all the unique values in 1 column the new rows and all the unique values in the other column the new column labels. Then, it puts the values of "vals", with the function applied to each group, in the corresponding cells.
For example: http://data8.org/interactive_table_functions/
# Now we can finally begin a quick analysis using interesting table methods
# First, we can use a group to quickly quantify the # of counties in each media market
fixed_counts = pa_2020_mm.group("Media Market").column("count") / 3
# The dataset has 3 entries per Media Market, so divide by 3 to fix that
pa_2020_mm.group("Media Market").with_column("count", fixed_counts)
Media Market | count |
---|---|
Erie | 3 |
Harrisburg Lancaster Lebanon York | 9 |
Johnstown Altoona | 12 |
Philadelphia | 8 |
Pittsburgh | 14 |
Wilkes Barre Scranton | 20 |
# How many voters are in each media market?
# Notice that we need to clean the resultant table a bit by dropping extra columns
pa_2020_mm.group("Media Market", sum).drop(1, 2, 3)
Media Market | Votes sum | Election Day Votes sum | Mail Votes sum | Provisional Votes sum |
---|---|---|---|---|
Erie | 199737 | 132314 | 64332 | 3091 |
Harrisburg Lancaster Lebanon York | 1053343 | 695025 | 340934 | 17384 |
Johnstown Altoona | 413636 | 301268 | 107983 | 4322 |
Philadelphia | 2851396 | 1536882 | 1271094 | 43420 |
Pittsburgh | 1560408 | 957033 | 577251 | 26124 |
Wilkes Barre Scranton | 817297 | 556079 | 250240 | 10978 |
sum(pa_2020_mm.where("Media Market", "Erie").column("Votes"))
199737
# We can also use a double group to quantify by both media market and party
pa_2020_mm.group(["Media Market", "Party Name"], sum).drop(2, 3)
/opt/conda/lib/python3.8/site-packages/datascience/tables.py:920: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray values = np.array(tuple(values))
Media Market | Party Name | Votes sum | Election Day Votes sum | Mail Votes sum | Provisional Votes sum |
---|---|---|---|---|---|
Erie | Democratic | 87276 | 39620 | 46372 | 1284 |
Erie | Libertarian | 2796 | 1969 | 789 | 38 |
Erie | Republican | 109665 | 90725 | 17171 | 1769 |
Harrisburg Lancaster Lebanon York | Democratic | 417953 | 176480 | 234112 | 7361 |
Harrisburg Lancaster Lebanon York | Libertarian | 15387 | 10631 | 4438 | 318 |
Harrisburg Lancaster Lebanon York | Republican | 620003 | 507914 | 102384 | 9705 |
Johnstown Altoona | Democratic | 122931 | 54255 | 67480 | 1167 |
Johnstown Altoona | Libertarian | 4822 | 3380 | 1383 | 57 |
Johnstown Altoona | Republican | 285883 | 243633 | 39120 | 3098 |
Philadelphia | Democratic | 1793078 | 716629 | 1048866 | 27583 |
... (8 rows omitted)
# In general, how did each party vote? (Election Day, Mail Votes, Provisional)
# Let's figure it out.
party_by_votes = pa_2020_mm.group("Party Name", sum).drop([1, 2, 7])
party_by_votes
Party Name | Votes sum | Election Day Votes sum | Mail Votes sum | Provisional Votes sum |
---|---|---|---|---|
Democratic | 3453131 | 1406587 | 1993347 | 53168 |
Libertarian | 79095 | 53095 | 24721 | 1277 |
Republican | 3363591 | 2718919 | 593766 | 50874 |
# Let's convert all of the columns into a proportion using array arithmetic; just run this cell
# This makes it a bit easier to compare across party by controlling by # of votes
def vote_prop(col_str):
return party_by_votes.column(col_str) / party_by_votes.column("Votes sum")
party_vote_props = party_by_votes.select("Party Name").with_columns("Election Day Votes", vote_prop("Election Day Votes sum"),
"Mail Votes", vote_prop("Mail Votes sum"),
"Provisional Votes", vote_prop("Provisional Votes sum"))
# Bar chart to see the breakdown!
party_vote_props.barh(0)
# Reminder of the table setup
pa_2020_mm.show(5)
County Name | Party Name | Candidate Name | Votes | Election Day Votes | Mail Votes | Provisional Votes | Media Market |
---|---|---|---|---|---|---|---|
ADAMS | Democratic | BIDEN, JOSEPH ROBINETTE JR | 18207 | 6611 | 11436 | 160 | Harrisburg Lancaster Lebanon York |
ADAMS | Republican | TRUMP, DONALD J. | 37523 | 29942 | 7148 | 433 | Harrisburg Lancaster Lebanon York |
ADAMS | Libertarian | JORGENSEN, JO | 810 | 544 | 256 | 10 | Harrisburg Lancaster Lebanon York |
ALLEGHENY | Democratic | BIDEN, JOSEPH ROBINETTE JR | 429065 | 148171 | 273080 | 7814 | Pittsburgh |
ALLEGHENY | Republican | TRUMP, DONALD J. | 282324 | 209459 | 66575 | 6290 | Pittsburgh |
... (193 rows omitted)
Now let's try "cross classifying"; this is similar to a 2 column group, but let's focus on a specific question:
How were the votes broken down by Media Market and party?
Or, in other words, what media markets provided most of the raw votes come from for each party? This is useful information because electoral college votes are based on state totals (i.e. it doesn't matter if Johnstown Altoona is very Republican if there aren't many voters there).
# Pivots the table so we have parties as column labels (arg 1), media markets as rows (arg 2), and
# vote totals as cell values (arg 3); we aggregate vote totals with sum (arg 4)
market_vs_party = pa_2020_mm.pivot("Party Name", "Media Market", "Votes", sum)
market_vs_party
/opt/conda/lib/python3.8/site-packages/datascience/tables.py:920: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray
Media Market | Democratic | Libertarian | Republican |
---|---|---|---|
Erie | 87276 | 2796 | 109665 |
Harrisburg Lancaster Lebanon York | 417953 | 15387 | 620003 |
Johnstown Altoona | 122931 | 4822 | 285883 |
Philadelphia | 1793078 | 27805 | 1030513 |
Pittsburgh | 718394 | 18121 | 823893 |
Wilkes Barre Scranton | 313499 | 10164 | 493634 |
# Let's look at the data with a bar chart
# What does this tell us?
market_vs_party.barh(0)
# Now, what if I wanted to figure out the preferred party for each media market?
# Just run this cell; it's a lot of wordy array arithmetic: we're converting each party to proportions
market_vs_party.column("Democratic") / (market_vs_party.column("Democratic") + market_vs_party.column("Libertarian") + market_vs_party.column("Republican"))
mvp_props = market_vs_party.select("Media Market").with_columns("Democratic", market_vs_party.column("Democratic") / (market_vs_party.column("Democratic") + market_vs_party.column("Libertarian") + market_vs_party.column("Republican")),
"Libertarian", market_vs_party.column("Libertarian") / (market_vs_party.column("Democratic") + market_vs_party.column("Libertarian") + market_vs_party.column("Republican")),
"Republican", market_vs_party.column("Republican") / (market_vs_party.column("Democratic") + market_vs_party.column("Libertarian") + market_vs_party.column("Republican")))
# Now that we've controlled for number of votes in media market, this shows us who "won" each media market
mvp_props.barh(0)
If you're interested, check out this notebook by Ian that gives us more insight into the state by looking at more elections data.
You'll notice that the linked notebook uses a different syntax -- that's because it uses pandas
instead of the datascience
library that we're used to!
The datascience
library is great because it's simple and easier to learn, but has a lot of limitations and requires a lot of work to do just this level of cleaning and EDA. You can learn in further data science classes about pandas
, which is a lot more powerful, concise, and lets us do even more interesting analysis with data tables.