# Importing all of our tools
import numpy as np
from datascience import *
from math import *

import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

import plotly.express as px
Table.interactive_plots()


pa_2020 = Table().read_table("data/2020_presidential.CSV")
pa_2020.show(5)


cols_of_interest = ["County Name", "Party Name", "Candidate Name", "Votes", "Election Day Votes", "Mail Votes", "Provisional Votes"]

pa_2020 = pa_2020.select(cols_of_interest)
pa_2020


# What was the voter turn out (# of voters who voted) in 2020? Try calculating it.
total_votes = sum(pa_2020.column("Votes"))
total_votes

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
/tmp/ipykernel_112/1346453950.py in <module>
      1 # What was the voter turn out (# of voters who voted) in 2020? Try calculating it.
----> 2 total_votes = sum(pa_2020.column("Votes"))
      3 total_votes

TypeError: unsupported operand type(s) for +: 'int' and 'numpy.str_'


# What's the problem? Let's check the data types.
pa_2020.column("Votes").item(0)

'18,207'


# It looks like we need to do some data cleaning. How do we convert this into a number we can work with?
adamsctybiden = pa_2020.column("Votes").item(0)

# strings are a specific object with methods associated
# remember we can "cast" a datatype to a int
int(adamsctybiden.replace(",", ""))

18207


# Now, how do we do this to all of the numbers in the dataset? We need a function.

def vote_to_int(votes):
    """Given a string of votes, convert to an integer"""
    return int(votes.replace(",", "")) 

vote_to_int(adamsctybiden)

18207


# To do a lot of conversions, we can apply this to the data in the table!

votes_as_int = pa_2020.apply(vote_to_int, "Votes")

sum(votes_as_int) # Now we can do math!

6915283


# NOTE: We can do the same thing with a for loop:
looped_ver = np.array([])

for vote in pa_2020.column("Votes"):
    looped_ver = np.append(looped_ver, vote_to_int(vote))

looped_ver

array([1.82070e+04, 3.75230e+04, 8.10000e+02, 4.29065e+05, 2.82324e+05,
       8.34400e+03, 8.45700e+03, 2.74890e+04, 4.24000e+02, 3.81220e+04,
       5.47590e+04, 1.24100e+03, 4.36700e+03, 2.30250e+04, 1.82000e+02,
       9.28950e+04, 1.09736e+05, 2.90900e+03, 1.76360e+04, 4.53060e+04,
       6.53000e+02, 8.04600e+03, 2.16000e+04, 5.13000e+02, 2.04712e+05,
       1.87367e+05, 4.15500e+03, 3.75080e+04, 7.43590e+04, 1.43800e+03,
       2.17300e+04, 4.80850e+04, 7.59000e+02, 6.34000e+02, 1.77100e+03,
       2.90000e+01, 1.12120e+04, 2.19840e+04, 4.33000e+02, 4.00550e+04,
       3.63720e+04, 1.06600e+03, 1.82372e+05, 1.28565e+05, 3.56500e+03,
       4.67800e+03, 1.45780e+04, 2.37000e+02, 9.67300e+03, 2.92030e+04,
       5.46000e+02, 5.50200e+03, 1.19020e+04, 2.21000e+02, 1.05320e+04,
       2.00980e+04, 5.41000e+02, 1.29240e+04, 2.85590e+04, 5.21000e+02,
       6.22450e+04, 7.72120e+04, 2.13800e+03, 7.89830e+04, 6.64080e+04,
       1.97700e+03, 2.06423e+05, 1.18532e+05, 2.97600e+03, 4.52200e+03,
       1.21400e+04, 2.44000e+02, 6.82860e+04, 6.68690e+04, 1.92800e+03,
       2.04440e+04, 4.12270e+04, 4.68000e+02, 7.28000e+02, 1.88200e+03,
       3.60000e+01, 2.24220e+04, 5.72450e+04, 1.11600e+03, 1.08500e+03,
       6.82400e+03, 6.80000e+01, 4.91100e+03, 1.25790e+04, 1.79000e+02,
       5.44500e+03, 1.70610e+04, 2.86000e+02, 1.26340e+04, 2.80890e+04,
       4.75000e+02, 4.52700e+03, 1.79600e+04, 3.37000e+02, 2.25300e+03,
       9.64900e+03, 1.41000e+02, 6.19910e+04, 5.23340e+04, 1.08500e+03,
       1.15847e+05, 1.60209e+05, 4.18300e+03, 1.59780e+04, 2.95970e+04,
       5.01000e+02, 2.39320e+04, 4.67310e+04, 9.89000e+02, 9.82880e+04,
       8.42590e+04, 2.16600e+03, 6.48730e+04, 8.69290e+04, 1.51900e+03,
       1.69710e+04, 4.14620e+04, 8.21000e+02, 5.09800e+03, 1.40830e+04,
       2.85000e+02, 2.10670e+04, 3.61430e+04, 7.44000e+02, 4.60300e+03,
       1.66700e+04, 2.29000e+02, 4.40600e+04, 3.87260e+04, 1.04300e+03,
       3.19511e+05, 1.85460e+05, 5.18600e+03, 3.77100e+03, 5.84400e+03,
       1.56000e+02, 8.50870e+04, 8.38540e+04, 2.00100e+03, 1.26770e+04,
       2.89520e+04, 6.54000e+02, 5.95000e+03, 1.82930e+04, 4.09000e+02,
       6.03790e+05, 1.32740e+05, 4.84700e+03, 1.30190e+04, 1.92130e+04,
       3.22000e+02, 1.72600e+03, 7.23900e+03, 9.90000e+01, 2.07270e+04,
       4.88710e+04, 1.00500e+03, 4.91000e+03, 1.39830e+04, 2.47000e+02,
       8.65400e+03, 3.14660e+04, 4.23000e+02, 9.21000e+02, 2.61900e+03,
       5.50000e+01, 6.23600e+03, 1.52070e+04, 3.09000e+02, 4.95500e+03,
       1.57420e+04, 3.78000e+02, 7.47500e+03, 1.23560e+04, 2.84000e+02,
       7.58500e+03, 1.85690e+04, 3.74000e+02, 6.06600e+03, 1.42370e+04,
       3.47000e+02, 4.50880e+04, 7.20800e+04, 1.31000e+03, 9.19100e+03,
       1.86370e+04, 2.61000e+02, 7.21290e+04, 1.30218e+05, 2.35000e+03,
       4.70400e+03, 9.93600e+03, 2.18000e+02, 8.81140e+04, 1.46733e+05,
       3.62400e+03])


# Now, we need to convert all string data into integer data
cols_to_convert = ["Votes", "Election Day Votes", "Mail Votes", "Provisional Votes"]

for col in cols_to_convert:
    votes_as_ints = pa_2020.apply(vote_to_int, col)
    pa_2020 = pa_2020.with_column(col, votes_as_ints)
    
pa_2020


# Checking data types again, just in case
type(pa_2020.column("Mail Votes").item(0))

int


# Looks good! Now, as someone interested in media, let's add in some info about TV media markets
pa_media_markets = Table().read_table("data/media_markets.csv")
pa_media_markets.show(5)


# I want this information attached to pa_2020
# We'll learn this tomorrow: we can "join" multiple tables by a specific column
# One issue: the data needs to be the same across tables to work,
# so the upper case vs capitalized counties will give us an error

mm_counties = pa_media_markets.column("County")
cleaned_counties = make_array()

for county in mm_counties:
    clean_county = county.upper() # Converting counties into upper case to make this work
    cleaned_counties = np.append(cleaned_counties, clean_county)

cleaned_counties

array(['CRAWFORD', 'ERIE', 'MCKEAN', 'WARREN', 'ADAMS', 'CUMBERLAND',
       'DAUPHIN', 'FRANKLIN', 'JUNIATA', 'LANCASTER', 'LEBANON', 'PERRY',
       'YORK', 'BEDFORD', 'BLAIR', 'CAMBRIA', 'CAMERON', 'CENTRE',
       'CLEARFIELD', 'ELK', 'FULTON', 'HUNTINGDON', 'JEFFERSON',
       'MIFFLIN', 'SOMERSET', 'BERKS', 'BUCKS', 'CHESTER', 'DELAWARE',
       'LEHIGH', 'MONTGOMERY', 'NORTHAMPTON', 'PHILADELPHIA', 'ALLEGHENY',
       'ARMSTRONG', 'BEAVER', 'BUTLER', 'CLARION', 'FAYETTE', 'FOREST',
       'GREENE', 'INDIANA', 'LAWRENCE', 'MERCER', 'VENANGO', 'WASHINGTON',
       'WESTMORELAND', 'BRADFORD', 'CARBON', 'CLINTON', 'COLUMBIA',
       'LACKAWANNA', 'LUZERNE', 'LYCOMING', 'MONROE', 'MONTOUR',
       'NORTHUMBERLAND', 'PIKE', 'POTTER', 'SCHUYLKILL', 'SNYDER',
       'SULLIVAN', 'SUSQUEHANNA', 'TIOGA', 'UNION', 'WAYNE', 'WYOMING'],
      dtype='<U32')


# Cleaning up the media markets table with the new names
pa_media_markets = pa_media_markets.with_column("County", cleaned_counties)
pa_media_markets


# Finally, combining all of our data with a join
pa_2020_mm = pa_2020.join("County Name", pa_media_markets, "County")
pa_2020_mm


# Now we can finally begin a quick analysis using interesting table methods
# First, we can use a group to quickly quantify the # of counties in each media market

fixed_counts = pa_2020_mm.group("Media Market").column("count") / 3 
# The dataset has 3 entries per Media Market, so divide by 3 to fix that

pa_2020_mm.group("Media Market").with_column("count", fixed_counts)


# How many voters are in each media market? 
# Notice that we need to clean the resultant table a bit by dropping extra columns

pa_2020_mm.group("Media Market", sum).drop(1, 2, 3)


sum(pa_2020_mm.where("Media Market", "Erie").column("Votes"))

199737


# We can also use a double group to quantify by both media market and party
pa_2020_mm.group(["Media Market", "Party Name"], sum).drop(2, 3)

/opt/conda/lib/python3.8/site-packages/datascience/tables.py:920: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray
  values = np.array(tuple(values))


# In general, how did each party vote? (Election Day, Mail Votes, Provisional)
# Let's figure it out. 

party_by_votes = pa_2020_mm.group("Party Name", sum).drop([1, 2, 7])
party_by_votes


# Let's convert all of the columns into a proportion using array arithmetic; just run this cell
# This makes it a bit easier to compare across party by controlling by # of votes

def vote_prop(col_str):
    return party_by_votes.column(col_str) / party_by_votes.column("Votes sum")

party_vote_props = party_by_votes.select("Party Name").with_columns("Election Day Votes", vote_prop("Election Day Votes sum"),
                                                                   "Mail Votes", vote_prop("Mail Votes sum"),
                                                                   "Provisional Votes", vote_prop("Provisional Votes sum"))

# Bar chart to see the breakdown! 
party_vote_props.barh(0)


# Reminder of the table setup
pa_2020_mm.show(5)


# Pivots the table so we have parties as column labels (arg 1), media markets as rows (arg 2), and
# vote totals as cell values (arg 3); we aggregate vote totals with sum (arg 4)
market_vs_party = pa_2020_mm.pivot("Party Name", "Media Market", "Votes", sum)
market_vs_party

/opt/conda/lib/python3.8/site-packages/datascience/tables.py:920: VisibleDeprecationWarning:

Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray


# Let's look at the data with a bar chart
# What does this tell us?
market_vs_party.barh(0)


# Now, what if I wanted to figure out the preferred party for each media market?
# Just run this cell; it's a lot of wordy array arithmetic: we're converting each party to proportions

market_vs_party.column("Democratic") / (market_vs_party.column("Democratic") + market_vs_party.column("Libertarian") + market_vs_party.column("Republican"))

mvp_props = market_vs_party.select("Media Market").with_columns("Democratic", market_vs_party.column("Democratic") / (market_vs_party.column("Democratic") + market_vs_party.column("Libertarian") + market_vs_party.column("Republican")),
                                                                "Libertarian", market_vs_party.column("Libertarian") / (market_vs_party.column("Democratic") + market_vs_party.column("Libertarian") + market_vs_party.column("Republican")),
                                                                "Republican", market_vs_party.column("Republican") / (market_vs_party.column("Democratic") + market_vs_party.column("Libertarian") + market_vs_party.column("Republican")))

# Now that we've controlled for number of votes in media market, this shows us who "won" each media market
mvp_props.barh(0)

Election Name	County Name	Office Name	District Name	Party Name	Candidate Name	Votes	Election Day Votes	Mail Votes	Provisional Votes
2020 Presidential Election	ADAMS	President of the United States	Statewide	Democratic	BIDEN, JOSEPH ROBINETTE JR	18,207	6,611	11,436	160
2020 Presidential Election	ADAMS	President of the United States	Statewide	Republican	TRUMP, DONALD J.	37,523	29,942	7,148	433
2020 Presidential Election	ADAMS	President of the United States	Statewide	Libertarian	JORGENSEN, JO	810	544	256	10
2020 Presidential Election	ALLEGHENY	President of the United States	Statewide	Democratic	BIDEN, JOSEPH ROBINETTE JR	429,065	148,171	273,080	7,814
2020 Presidential Election	ALLEGHENY	President of the United States	Statewide	Republican	TRUMP, DONALD J.	282,324	209,459	66,575	6,290

County Name	Party Name	Candidate Name	Votes	Election Day Votes	Mail Votes	Provisional Votes
ADAMS	Democratic	BIDEN, JOSEPH ROBINETTE JR	18,207	6,611	11,436	160
ADAMS	Republican	TRUMP, DONALD J.	37,523	29,942	7,148	433
ADAMS	Libertarian	JORGENSEN, JO	810	544	256	10
ALLEGHENY	Democratic	BIDEN, JOSEPH ROBINETTE JR	429,065	148,171	273,080	7,814
ALLEGHENY	Republican	TRUMP, DONALD J.	282,324	209,459	66,575	6,290
ALLEGHENY	Libertarian	JORGENSEN, JO	8,344	5,085	3,076	183
ARMSTRONG	Democratic	BIDEN, JOSEPH ROBINETTE JR	8,457	4,275	4,108	74
ARMSTRONG	Republican	TRUMP, DONALD J.	27,489	24,233	2,917	339
ARMSTRONG	Libertarian	JORGENSEN, JO	424	340	78	6
BEAVER	Democratic	BIDEN, JOSEPH ROBINETTE JR	38,122	17,304	20,153	665

County Name	Party Name	Candidate Name	Votes	Election Day Votes	Mail Votes	Provisional Votes
ADAMS	Democratic	BIDEN, JOSEPH ROBINETTE JR	18207	6611	11436	160
ADAMS	Republican	TRUMP, DONALD J.	37523	29942	7148	433
ADAMS	Libertarian	JORGENSEN, JO	810	544	256	10
ALLEGHENY	Democratic	BIDEN, JOSEPH ROBINETTE JR	429065	148171	273080	7814
ALLEGHENY	Republican	TRUMP, DONALD J.	282324	209459	66575	6290
ALLEGHENY	Libertarian	JORGENSEN, JO	8344	5085	3076	183
ARMSTRONG	Democratic	BIDEN, JOSEPH ROBINETTE JR	8457	4275	4108	74
ARMSTRONG	Republican	TRUMP, DONALD J.	27489	24233	2917	339
ARMSTRONG	Libertarian	JORGENSEN, JO	424	340	78	6
BEAVER	Democratic	BIDEN, JOSEPH ROBINETTE JR	38122	17304	20153	665

County Name	Party Name	Candidate Name	Votes	Election Day Votes	Mail Votes	Provisional Votes	Media Market
ADAMS	Democratic	BIDEN, JOSEPH ROBINETTE JR	18207	6611	11436	160	Harrisburg Lancaster Lebanon York
ADAMS	Republican	TRUMP, DONALD J.	37523	29942	7148	433	Harrisburg Lancaster Lebanon York
ADAMS	Libertarian	JORGENSEN, JO	810	544	256	10	Harrisburg Lancaster Lebanon York
ALLEGHENY	Democratic	BIDEN, JOSEPH ROBINETTE JR	429065	148171	273080	7814	Pittsburgh
ALLEGHENY	Republican	TRUMP, DONALD J.	282324	209459	66575	6290	Pittsburgh
ALLEGHENY	Libertarian	JORGENSEN, JO	8344	5085	3076	183	Pittsburgh
ARMSTRONG	Democratic	BIDEN, JOSEPH ROBINETTE JR	8457	4275	4108	74	Pittsburgh
ARMSTRONG	Republican	TRUMP, DONALD J.	27489	24233	2917	339	Pittsburgh
ARMSTRONG	Libertarian	JORGENSEN, JO	424	340	78	6	Pittsburgh
BEAVER	Democratic	BIDEN, JOSEPH ROBINETTE JR	38122	17304	20153	665	Pittsburgh

Media Market	Votes sum	Election Day Votes sum	Mail Votes sum	Provisional Votes sum
Erie	199737	132314	64332	3091
Harrisburg Lancaster Lebanon York	1053343	695025	340934	17384
Johnstown Altoona	413636	301268	107983	4322
Philadelphia	2851396	1536882	1271094	43420
Pittsburgh	1560408	957033	577251	26124
Wilkes Barre Scranton	817297	556079	250240	10978

Lecture 19 – Exploratory Data Analysis: Voter Targeting in Pennsylvania¶

Data 6, Summer 2021¶

Step 1: Data Cleaning¶

Step 2: Exploratory Data Analysis¶

Additional Analysis: Targeting by Market and Understanding What Happened in 2020¶

Media Market	County
Erie	Crawford
Erie	Erie
Erie	McKean
Erie	Warren
Harrisburg Lancaster Lebanon York	Adams

Media Market	Party Name	Votes sum	Election Day Votes sum	Mail Votes sum	Provisional Votes sum
Erie	Democratic	87276	39620	46372	1284
Erie	Libertarian	2796	1969	789	38
Erie	Republican	109665	90725	17171	1769
Harrisburg Lancaster Lebanon York	Democratic	417953	176480	234112	7361
Harrisburg Lancaster Lebanon York	Libertarian	15387	10631	4438	318
Harrisburg Lancaster Lebanon York	Republican	620003	507914	102384	9705
Johnstown Altoona	Democratic	122931	54255	67480	1167
Johnstown Altoona	Libertarian	4822	3380	1383	57
Johnstown Altoona	Republican	285883	243633	39120	3098
Philadelphia	Democratic	1793078	716629	1048866	27583

Party Name	Votes sum	Election Day Votes sum	Mail Votes sum	Provisional Votes sum
Democratic	3453131	1406587	1993347	53168
Libertarian	79095	53095	24721	1277
Republican	3363591	2718919	593766	50874