# # if you do not have the folder to begin with:
#from google.colab import drive
#drive.mount('/content/drive')#,force_remount=True)
#%cd content/drive/MyDrive/MadBeignet.github.io.git
# !git clone git@github.com:MadBeignet/MadBeignet.github.io.git
# !git clone https://github.com/MadBeignet/MadBeignet.github.io


#!ls


#%cd../../../


# # first, mount your google drive, change to the course folder, pull latest changes, and change to the lab folder.
#from google.colab import drive
#drive.mount('/content/drive',force_remount=True)
# %cd content/MadBeignet.github.io
# !git pull
%cd Data

[Errno 2] No such file or directory: 'Data'
/Users/maddiewisinski/Documents/GitHub/MadBeignet.github.io/Data

!ls

PIRUS_May2020      Protests           archive_folder
Population         Voter_Turnouts     fancy_website.html


#%cd "drive/MyDrive/MadBeignet.github.io/Data"


# imports
import pandas as pd
import re
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
import numpy as np
from matplotlib.pyplot import figure
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from numpy.core.getlimits import inf
pd.set_option('mode.chained_assignment',None)


csv_final = pd.read_csv("./Voter_Turnouts/2000 November General Election - Turnout Rates.csv",
                        header = None,
                        skiprows = 2)#first two rows is header in CSV
csv_final['Year']=2000

l = []#we will use this to make sure all files loaded
for a in range (2002,2012,2):
  csv_temp = pd.read_csv("./Voter_Turnouts/"+str(a)+" November General Election - Turnout Rates.csv",
                         header = None,
                         skiprows = 2)#first two rows are headers in csv
  csv_temp['Year']=a#As year is incremented,value changes
  l.append(1)
  csv_final = pd.concat([csv_final,csv_temp],ignore_index = True)
final_df = pd.DataFrame(csv_final)
print(len(l) == 5)#Test to make sure all files were uploaded, returns true if successful, false else
final_df.columns = ['Region', 'VEP Total Ballots Counted', 'VEP Highest Office', 'VAP Highest Office', 'Total Ballots Counted', 'Highest Office', 'Voting-Eligible Population (VEP)', 'Voting-Age Population (VAP)', '% Non-citizen', 'Prison', 'Probation', 'Parole', 'Total Ineligible Felon', 'Overseas Eligible', 'Year']
#Rename all columns

final_df

True


l = []#we will use this to make sure all files loaded
for a in range (2012,2016,2):
  csv_temp = pd.read_csv("./Voter_Turnouts/"+str(a)+" November General Election - Turnout Rates.csv",
                         header = None,
                         skiprows = 2,
                         names = ['Region', 'VEP Total Ballots Counted', 'VEP Highest Office', 'VAP Highest Office', 'Total Ballots Counted', 'Highest Office', 'Voting-Eligible Population (VEP)', 'Voting-Age Population (VAP)', '% Non-citizen', 'Prison', 'Probation', 'Parole', 'Total Ineligible Felon', 'Overseas Eligible', 'State Abv'])#first two rows are headers in csv
  csv_temp['Year']=a#As year is incremented,value changes
  csv_temp = csv_temp.iloc[:52]
  csv_temp.drop('State Abv',inplace=True,axis=1)
  csv_final = pd.concat([csv_final,csv_temp],ignore_index=True)#not sure whats going wrong now, I'll ask Dr. Culotta

csv_final


temp_18 = pd.read_csv("./Voter_Turnouts/2018 November General Election - Turnout Rates.csv",
                      names =['Region', 'Estimated or Actual 2018 Total Ballots Counted VEP Turnout Rate', '2018 Vote for Highest Office VEP Turnout Rate', 'Status', 'Source', 'Estimated or Actual 2018 Total Ballots Counted', '2018 Vote for Highest Office', 'Voting-Eligible Population (VEP)', 'Voting-Age Population (VAP)', '% Non-citizen', 'Prison', 'Probation', 'Parole', 'Total Ineligible Felon', 'Overseas Eligible', 'State Abv'],
                      skiprows=2,
                      header = None)
temp_18.drop('Source',inplace=True,axis=1)
temp_18.drop('Status',inplace=True,axis=1)
temp_18.drop('State Abv',inplace=True,axis=1)
csv_final.drop('VAP Highest Office',inplace=True,axis=1)
csv_final.columns
temp_18.columns = ['Region', 'VEP Total Ballots Counted', 'VEP Highest Office',
       'Total Ballots Counted', 'Highest Office',
       'Voting-Eligible Population (VEP)', 'Voting-Age Population (VAP)',
       '% Non-citizen', 'Prison', 'Probation', 'Parole',
       'Total Ineligible Felon', 'Overseas Eligible']
temp_18['Year']=2018
temp_18 = temp_18.iloc[:52]
csv_final = pd.concat([csv_final,temp_18],ignore_index = True)
csv_final


l = 'Region,Source,Status,Total Ballots Counted (Estimate),Vote for Highest Office (President),VEP Turnout Rate (Total Ballots Counted),VEP Turnout Rate (Highest Office),Voting-Eligible Population (VEP),Voting-Age Population (VAP),% Non-citizen,Prison,Probation,Parole,Total Ineligible Felon,Overseas Eligible,State Abv'
lis = l.split(',')
print(lis)

['Region', 'Source', 'Status', 'Total Ballots Counted (Estimate)', 'Vote for Highest Office (President)', 'VEP Turnout Rate (Total Ballots Counted)', 'VEP Turnout Rate (Highest Office)', 'Voting-Eligible Population (VEP)', 'Voting-Age Population (VAP)', '% Non-citizen', 'Prison', 'Probation', 'Parole', 'Total Ineligible Felon', 'Overseas Eligible', 'State Abv']


temp_16 = pd.read_csv("./Voter_Turnouts/2016 November General Election - Turnout Rates.csv",
                      names =['Region', 'State Results Website', 'Status', 'VEP Total Ballots Counted', 'VEP Highest Office', 'VAP Highest Office', 'Total Ballots Counted (Estimate)', 'Highest Office', 'Voting-Eligible Population (VEP)', 'Voting-Age Population (VAP)', '% Non-citizen', 'Prison', 'Probation', 'Parole', 'Total Ineligible Felon', 'Overseas Eligible', 'State Abv'],
                      skiprows=2,
                      header = None)
temp_16.drop('Status',inplace=True,axis=1)
temp_16.drop('State Results Website',inplace=True,axis=1)
temp_16.drop('State Abv',inplace=True,axis=1)
temp_16.drop('VAP Highest Office',inplace=True,axis=1)
temp_16['Year']=2016
temp_16.columns = csv_final.columns
temp_16 = temp_16.iloc[:52]
csv_final = pd.concat([csv_final,temp_16],ignore_index = True)
csv_final


temp_20 = pd.read_csv("./Voter_Turnouts/2020 November General Election - Turnout Rates.csv",
                      names =['Region', 'Source', 'Status', 'Total Ballots Counted (Estimate)', 'Vote for Highest Office (President)', 'VEP Turnout Rate (Total Ballots Counted)', 'VEP Turnout Rate (Highest Office)', 'Voting-Eligible Population (VEP)', 'Voting-Age Population (VAP)', '% Non-citizen', 'Prison', 'Probation', 'Parole', 'Total Ineligible Felon', 'Overseas Eligible', 'State Abv'],
                      skiprows=2,
                      header = None)
temp_20.drop('Source',inplace=True,axis=1)
temp_20.drop('Status',inplace=True,axis=1)
temp_20.drop('State Abv',inplace=True,axis=1)
temp_20 = temp_20[['Region','VEP Turnout Rate (Total Ballots Counted)','VEP Turnout Rate (Highest Office)','Total Ballots Counted (Estimate)', 'Vote for Highest Office (President)', 'Voting-Eligible Population (VEP)', 'Voting-Age Population (VAP)', '% Non-citizen', 'Prison', 'Probation', 'Parole', 'Total Ineligible Felon', 'Overseas Eligible']]
temp_20['Year'] = 2020
temp_20.columns = csv_final.columns
temp_20 = temp_20.iloc[:52]
csv_final = pd.concat([csv_final,temp_20],ignore_index = True)
csv_final


states_cleaned = []
for e in csv_final.Region:
    e = str(e).replace('*','')
    states_cleaned.append(e)
csv_final.Region = states_cleaned

pd.unique(csv_final.Region)

array(['United States', 'Alabama', 'Alaska', 'Arizona', 'Arkansas',
       'California', 'Colorado', 'Connecticut', 'Delaware',
       'District of Columbia', 'Florida', 'Georgia', 'Hawaii', 'Idaho',
       'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana',
       'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota',
       'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada',
       'New Hampshire', 'New Jersey', 'New Mexico', 'New York',
       'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon',
       'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota',
       'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington',
       'West Virginia', 'Wisconsin', 'Wyoming'], dtype=object)


a = "Subject_ID,Loc_Plot_State1,Loc_Plot_City1,Loc_Plot_State2,Loc_Plot_City2,Date_Exposure,Plot_Target1,Plot_Target2,Plot_Target3,Attack_Preparation,Op_Security,Changing_Target,Anticp_Fatals_Targ,Internet_Use_Plot,Extent_Plot,Violent,Criminal_Severity,Criminal_Charges,Indict_Arrest,Current_Status,Group_Membership,Terrorist_Group_Name1,Terrorist_Group_Name2,Terrorist_Group_Name3,Actively_Recruited,Recruiter1,Recruiter2,Recruiter3,Actively_Connect,Group_Competition,Role_Group,Length_Group,Clique,Clique_Radicalize,Clique_Connect,Internet_Radicalization,Media_Radicalization,Social_Media,Social_Media_Frequency,Social_Media_Platform1,Social_Media_Platform2,Social_Media_Platform3,Social_Media_Platform4,Social_Media_Platform5,Social_Media_Activities1,Social_Media_Activities2,Social_Media_Activities3,Social_Media_Activities4,Social_Media_Activities5,Social_Media_Activities6,Social_Media_Activities7,Radicalization_Islamist,Radicalization_Far_Right,Radicalization_Far_Left,Radicalization_Single_Issue,Ideological_Sub_Category1,Ideological_Sub_Category2,Ideological_Sub_Category3,Loc_Habitation_State1,Loc_Habitation_City1,Loc_Habitation_State2,Loc_Habitation_City2,Itinerant,External_Rad,Rad_duration,Radical_Behaviors,Radical_Beliefs,US_Govt_Leader,Foreign_Govt_Leader,Event_Influence1,Event_Influence2,Event_Influence3,Event_Influence4,Beliefs_Trajectory,Behaviors_Trajectory,Radicalization_Sequence,Radicalization_Place,Prison_Radicalize,Broad_Ethnicity,Age,Marital_Status,Children,Age_Child,Gender,Religious_Background,Convert,Convert_Date,Reawakening,Reawakening_Date,Citizenship,Residency_Status,Nativity,Time_US_Months,Immigrant_Generation,Immigrant_Source,Language_English,Diaspora_Ties,Education,Student,Education_Change,Employment_Status,Change_Performance,Work_History,Military,Foreign_Military,Social_Stratum_Childhood,Social_Stratum_Adulthood,Aspirations,Abuse_Child,Abuse_Adult,Abuse_type1,Abuse_Type2,Abuse_Type3,Psychological,Alcohol_Drug,Absent_Parent,Overseas_Family,Close_Family,Family_Religiosity,Family_Ideology,Family_Ideological_Level,Prison_Family_Friend,Crime_Family_Friend,Radical_Friend,Radical_Family,Radical_Signif_Other,Relationship_Troubles,Platonic_Troubles,Unstructured_Time,Friendship_Source1,Friendship_Source2,Friendship_Source3,Kicked_Out,Previous_Criminal_Activity,Previous_Criminal_Activity_Type1,Previous_Criminal_Activity_Type2,Previous_Criminal_Activity_Type3,Previous_Criminal_Activity_Age,Gang,Gang_Age_Joined,Trauma,Other_Ideologies,Angry_US,Group_Grievance,Standing"
def listify(mis_string):
  return mis_string.split(",")
pirus_headlist = listify(a)
print(pirus_headlist)

['Subject_ID', 'Loc_Plot_State1', 'Loc_Plot_City1', 'Loc_Plot_State2', 'Loc_Plot_City2', 'Date_Exposure', 'Plot_Target1', 'Plot_Target2', 'Plot_Target3', 'Attack_Preparation', 'Op_Security', 'Changing_Target', 'Anticp_Fatals_Targ', 'Internet_Use_Plot', 'Extent_Plot', 'Violent', 'Criminal_Severity', 'Criminal_Charges', 'Indict_Arrest', 'Current_Status', 'Group_Membership', 'Terrorist_Group_Name1', 'Terrorist_Group_Name2', 'Terrorist_Group_Name3', 'Actively_Recruited', 'Recruiter1', 'Recruiter2', 'Recruiter3', 'Actively_Connect', 'Group_Competition', 'Role_Group', 'Length_Group', 'Clique', 'Clique_Radicalize', 'Clique_Connect', 'Internet_Radicalization', 'Media_Radicalization', 'Social_Media', 'Social_Media_Frequency', 'Social_Media_Platform1', 'Social_Media_Platform2', 'Social_Media_Platform3', 'Social_Media_Platform4', 'Social_Media_Platform5', 'Social_Media_Activities1', 'Social_Media_Activities2', 'Social_Media_Activities3', 'Social_Media_Activities4', 'Social_Media_Activities5', 'Social_Media_Activities6', 'Social_Media_Activities7', 'Radicalization_Islamist', 'Radicalization_Far_Right', 'Radicalization_Far_Left', 'Radicalization_Single_Issue', 'Ideological_Sub_Category1', 'Ideological_Sub_Category2', 'Ideological_Sub_Category3', 'Loc_Habitation_State1', 'Loc_Habitation_City1', 'Loc_Habitation_State2', 'Loc_Habitation_City2', 'Itinerant', 'External_Rad', 'Rad_duration', 'Radical_Behaviors', 'Radical_Beliefs', 'US_Govt_Leader', 'Foreign_Govt_Leader', 'Event_Influence1', 'Event_Influence2', 'Event_Influence3', 'Event_Influence4', 'Beliefs_Trajectory', 'Behaviors_Trajectory', 'Radicalization_Sequence', 'Radicalization_Place', 'Prison_Radicalize', 'Broad_Ethnicity', 'Age', 'Marital_Status', 'Children', 'Age_Child', 'Gender', 'Religious_Background', 'Convert', 'Convert_Date', 'Reawakening', 'Reawakening_Date', 'Citizenship', 'Residency_Status', 'Nativity', 'Time_US_Months', 'Immigrant_Generation', 'Immigrant_Source', 'Language_English', 'Diaspora_Ties', 'Education', 'Student', 'Education_Change', 'Employment_Status', 'Change_Performance', 'Work_History', 'Military', 'Foreign_Military', 'Social_Stratum_Childhood', 'Social_Stratum_Adulthood', 'Aspirations', 'Abuse_Child', 'Abuse_Adult', 'Abuse_type1', 'Abuse_Type2', 'Abuse_Type3', 'Psychological', 'Alcohol_Drug', 'Absent_Parent', 'Overseas_Family', 'Close_Family', 'Family_Religiosity', 'Family_Ideology', 'Family_Ideological_Level', 'Prison_Family_Friend', 'Crime_Family_Friend', 'Radical_Friend', 'Radical_Family', 'Radical_Signif_Other', 'Relationship_Troubles', 'Platonic_Troubles', 'Unstructured_Time', 'Friendship_Source1', 'Friendship_Source2', 'Friendship_Source3', 'Kicked_Out', 'Previous_Criminal_Activity', 'Previous_Criminal_Activity_Type1', 'Previous_Criminal_Activity_Type2', 'Previous_Criminal_Activity_Type3', 'Previous_Criminal_Activity_Age', 'Gang', 'Gang_Age_Joined', 'Trauma', 'Other_Ideologies', 'Angry_US', 'Group_Grievance', 'Standing']


pirus_temp = pd.read_csv("./PIRUS_May2020/PIRUS_Public_May2020.csv",
                         header=1,
                         names = pirus_headlist)
pirus_temp


#Date_Exposure is not comparable because it is of dtype string.
#Create column 'Year' of int values that represents the last 2 digit of the year.
l = []
for val in pirus_temp['Date_Exposure']:
  a = val.split('/')
  b = a[-1:]
  l.append(int(b[0]))
pirus_temp['Year'] = l
#Any row with 'Year' under 22 occured in the 2000's and is in the scope of this study
pirus_temp = pirus_temp[pirus_temp['Year'] <= 22]
#Group by state
pirus_states_since_2000 = pirus_temp.value_counts('Loc_Habitation_State1')
pirus_states_since_2000.plot(kind='bar', figsize=(20, 10))

<AxesSubplot:xlabel='Loc_Habitation_State1'>


protests_temp = pd.read_csv("./Protests/data.csv",
                         header = 1,
                         names = ['Date','Location','Attendees',
                         'Event (legacy; see tags)','Tags',
                         'Curated','Source','Total_Articles'])
protests_temp.head(20)


#Must create state attribute to groupby state, similar to extracting year, but first create dictionary matching statges to abbreviation.
states = ["Alabama","Alaska","Arizona","Arkansas","California","Colorado","Connecticut","District of Columbia","Delaware","Florida","Georgia","Guam","Hawaii","Idaho","Illinois","Indiana","Iowa","Kansas","Kentucky","Louisiana","Maine","Maryland","Massachusetts","Michigan","Minnesota","Mississippi","Missouri","Montana","Nebraska","Nevada","New Hampshire","New Jersey","New Mexico","New York","North Carolina","North Dakota","Ohio","Oklahoma","Oregon","Pennsylvania","Puerto Rico","Rhode Island","South Carolina","South Dakota","Tennessee","Texas","United States","Utah", "Vermont","Virginia","Washington","West Virginia","Wisconsin","Wyoming"]
abbrev = ["AL","AK","AZ","AR","CA","CO","CT","DC","DE","FL","GA","GU","HI","ID","IL","IN","IA","KS","KY","LA","ME","MD","MA","MI","MN","MS","MO","MT","NE","NV","NH","NJ","NM","NY","NC","ND","OH","OK","OR","PA","PR","RI","SC","SD","TN","TX","US","UT","VT","VA","WA","WV","WI","WY"]
states_dict = {}
i = 0
for name in abbrev:
  states_dict[name] = states[i]
  i += 1
print(states_dict)
#create list of state names
protests_temp

{'AL': 'Alabama', 'AK': 'Alaska', 'AZ': 'Arizona', 'AR': 'Arkansas', 'CA': 'California', 'CO': 'Colorado', 'CT': 'Connecticut', 'DC': 'District of Columbia', 'DE': 'Delaware', 'FL': 'Florida', 'GA': 'Georgia', 'GU': 'Guam', 'HI': 'Hawaii', 'ID': 'Idaho', 'IL': 'Illinois', 'IN': 'Indiana', 'IA': 'Iowa', 'KS': 'Kansas', 'KY': 'Kentucky', 'LA': 'Louisiana', 'ME': 'Maine', 'MD': 'Maryland', 'MA': 'Massachusetts', 'MI': 'Michigan', 'MN': 'Minnesota', 'MS': 'Mississippi', 'MO': 'Missouri', 'MT': 'Montana', 'NE': 'Nebraska', 'NV': 'Nevada', 'NH': 'New Hampshire', 'NJ': 'New Jersey', 'NM': 'New Mexico', 'NY': 'New York', 'NC': 'North Carolina', 'ND': 'North Dakota', 'OH': 'Ohio', 'OK': 'Oklahoma', 'OR': 'Oregon', 'PA': 'Pennsylvania', 'PR': 'Puerto Rico', 'RI': 'Rhode Island', 'SC': 'South Carolina', 'SD': 'South Dakota', 'TN': 'Tennessee', 'TX': 'Texas', 'US': 'United States', 'UT': 'Utah', 'VT': 'Vermont', 'VA': 'Virginia', 'WA': 'Washington', 'WV': 'West Virginia', 'WI': 'Wisconsin', 'WY': 'Wyoming'}


#Create a list that can be added as a column to the DataFrame, representing the locatiion the protest took place in.
l = []
for val in protests_temp['Location']:
  m = val.split(',')
  if len(m) >= 2:
    n = m[-1][-2:]
    state = states_dict[n.upper()]
    l.append(state)
  else: #accounting for abnormal cases. implementation based on printing individual cases
    if m == 'La Porte County Courthouse in La Porte':
      l.append('Indiana')
    if m == 'Space':
      l.append('New York')
    if n == 'WA':
      l.append('Washington')
    if n == 'DE':
      l.append('Delaware')
protests_temp['State'] = l
#protests_temp


protests_by_state = protests_temp.value_counts('State')
protests_by_state.plot.barh(figsize=(10,15))

<AxesSubplot:ylabel='State'>


protests_by_state.mean()

718.7924528301887


pop20_21 = pd.read_csv('./Population/2020-2021 Census Bureau Population.csv')
#Rename columns due to header reading error
pop20_21.rename(columns={'Population Estimate\n (as of July 1)':'2020','Unnamed: 3':'2021'},inplace=True)
#Drop the first 6 rows becausethey are aggregates
pop20_21 = pop20_21.iloc[6:]
list1 = []
for i in pop20_21['Geographic Area']:
  i = i[1:]
  list1.append(i)
pop20_21['Geographic Area'] = list1
pop20_21.head()


pop10_19 = pd.read_csv('./Population/nst-est2019-01.csv')
#Rename columns due to header reading error
pop10_19.rename(columns={'Population Estimate (as of July 1)':'2010','Unnamed: 2':'Estimates Base','Unnamed: 4':'2011','Unnamed: 5':'2012','Unnamed: 5':'2012','Unnamed: 6':'2013','Unnamed: 7':'2014','Unnamed: 8':'2015','Unnamed: 9':'2016','Unnamed: 10':'2017','Unnamed: 11':'2018','Unnamed: 12': '2019'},inplace=True)
#Drop the first 6 rows becausethey are aggregates
pop10_19 = pop10_19.iloc[6:]
list1 = []
for i in pop10_19['Geographic Area']:
  i = i[1:]
  list1.append(i)
pop10_19['Geographic Area'] = list1
pop10_19.head()


total_population = pop10_19
total_population['2020'] = pop20_21['2020']
total_population['2021'] = pop20_21['2021']
total_population.drop(['April 1, 2010','Estimates Base'],inplace=True,axis=1)


total_population.columns

Index(['Geographic Area', '2010', '2011', '2012', '2013', '2014', '2015',
       '2016', '2017', '2018', '2019', '2020', '2021'],
      dtype='object')


def df_creation(row):
  ret_val = pd.DataFrame()
  ret_val['Population'] = list(row)[1:]
  ret_val['Year'] = total_population.columns[1:]
  return ret_val

S_pop = {}
for index, row in total_population.iterrows():
  S_pop[list(row)[0]] = df_creation(row)


ax=S_pop['Alabama'].plot(x='Year',y='Population')


total_population


per_population_growth = total_population.copy()
years = [2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021]
for i in range(len(years)):
    per_population_growth[str(years[len(years) - i - 1])] = total_population[str(years[len(years)- i- 1])]/total_population['2010']-1

per_population_growth.drop(['2010'],inplace=True,axis=1)
per_population_growth.set_index('Geographic Area').transpose().plot(figsize=(10,15))
plt.legend()
#plt.yscale("log")
plt.xlabel("Years")
plt.ylabel("Population")
plt.title("Population Growth Over Time For Each State")
plt.grid(linestyle=':')

handles, labels = plt.gca().get_legend_handles_labels()
order = per_population_growth['2021'].sort_values(ascending=False).keys()
order = order-6

plt.legend([handles[idx] for idx in order],[labels[idx] for idx in order],bbox_to_anchor=(1., 1.0), fancybox=True, shadow=True, ncol=1)

<matplotlib.legend.Legend at 0x2a0647a60>


pd.pivot_table(total_population, index='Geographic Area').head()


resistance_data = pd.DataFrame()
resistance_data['Radicalized_num'] = pirus_states_since_2000
resistance_data['Protest_num'] = protests_by_state
resistance_data.head()


resistance_data.rename({'Loc_Habitation_State1':'State'},inplace=True)
resistance_data.plot(kind='scatter',
                     y='Radicalized_num',
                     x='Protest_num',
                     ylabel = "Number of People Radicalized",
                     xlabel = "Number of Protests",
                     figsize=(10,8),
                     alpha=0.4,
                     color='purple',
                     s=30)

<AxesSubplot:xlabel='Number of Protests', ylabel='Number of People Radicalized'>


resistance_data['Protest_num'].corr(resistance_data['Radicalized_num'])

0.8996117793328046


resistance_data_merged = resistance_data.reset_index().rename(columns={'Loc_Habitation_State1':'State'}).merge(total_population.rename(columns={'Geographic Area':'State',"2021":"Population"})[["State","Population"]],on='State', how="right").set_index("State")
resistance_data_merged.head()


resistance_data_merged["Population"]

State
Alabama      5039877.0
Alaska        732673.0
               ...    
Wisconsin    5895908.0
Wyoming       578803.0
Name: Population, Length: 51, dtype: float64


resistance_data_merged.plot(kind='scatter',
                     y='Radicalized_num',
                     x='Protest_num',
                     ylabel = "Number of People Radicalized",
                     xlabel = "Number of Protests",
                     title="Number of People Radicalized vs Number of Protests",
                     figsize=(10,8),
                     alpha=0.4,
                     color='purple',
                     #s = resistance_data_merged['Population']
                     s=resistance_data_merged["Population"].apply({lambda x: x/1e4}),
                     )
plt.xscale("log")
plt.yscale("log")
x_vals = list(resistance_data_merged.reset_index()["Protest_num"])
y_vals = list(resistance_data_merged.reset_index()["Radicalized_num"])
states = list(resistance_data_merged.reset_index()["State"])
x_vals.pop(41)
y_vals.pop(41)
states.pop(41) # South Dakota has nan values
for i in range(len(x_vals)):
    plt.text(x_vals[i], y_vals[i], states[i], fontsize=8)


resistance_data_normalized = resistance_data_merged.copy()
resistance_data_normalized["Protest_num"] = resistance_data_normalized["Protest_num"]/resistance_data_normalized["Population"]
resistance_data_normalized["Radicalized_num"] = resistance_data_normalized["Radicalized_num"]/resistance_data_normalized["Population"]
resistance_data_normalized.plot(kind='scatter',
                     y='Radicalized_num',
                     x='Protest_num',
                     ylabel = "Number of People Radicalized (Normalized by Population)",
                     xlabel = "Number of Protests (Normalized by Population)",
                     title="Number of People Radicalized vs Number of Protests (Normalized by Population)",
                     figsize=(10,8),
                     alpha=0.4,
                     color='purple',
                     #s = resistance_data_merged['Population']
                     s=resistance_data_normalized["Population"].apply({lambda x: x/1e4}))
plt.xscale("log")
plt.yscale("log")
x_vals = list(resistance_data_normalized.reset_index()["Protest_num"])
y_vals = list(resistance_data_normalized.reset_index()["Radicalized_num"])
x_vals.pop(41)
y_vals.pop(41)
for i in range(len(x_vals)):
    plt.text(x=x_vals[i], y=y_vals[i], s=states[i], fontsize=7)


protests_iss =protests_temp[['Date','Location','Event (legacy; see tags)', 'Attendees','State','Tags']]
protests_iss_known = protests_iss


protests_iss_known.rename(columns={'Event (legacy; see tags)':'Event'},inplace=True)


def categorizer(word):
    p_list = [r"\s*([Rr]acial)",
    r'\s*(45)', r"\s*([Gg]un\s[Rr]ights)",r"\s*([Gg]un\s[Cc]ontrol)",
     r"\s*([Oo]ther)", r"\s*([Ee]nvironment)", 
     r"\s*([Ee]ducation)",r'\s*([Hh]ealthcare)',
     r"\s*([Ii]mmigration)",r"\s*([Ee]xecutive)", 
     r"\s*([Ii]nternational\s[Rr]elations)",
     r"\s*([Ll]egislative)",r"\s*([Cc]ivil\s[Rr]ights)"]
    tag_dict = {r"\s*([Rr]acial)":"Racial",
    r'\s*(45)':"45th President", r"\s*([Gg]un\s[Rr]ights)":"Gun Rights",r"\s*([Gg]un\s[Cc]ontrol)":"Gun Control",
     r"\s*([Oo]ther)":'Other', r"\s*([Ee]nvironment)":'Environment', 
     r"\s*([Ee]ducation)":"Education",r'\s*([Hh]ealthcare)':'Healthcare',
     r"\s*([Ii]mmigration)":'Immigration',r"\s*([Ee]xecutive)":'Executive', 
     r"\s*([Ii]nternational\s[Rr]elations)":'International Relations',
     r"\s*([Ll]egislative)":'Legislative',r"\s*([Cc]ivil\s[Rr]ights)":'Civil Rights','[]':'Other'}
    ret_list = set([])
    for w in word.split(';'):
        #print(w)
        for pattern in p_list:
            m = re.search(pattern,w)
            if m != None:
                b = tag_dict[pattern]
                if b not in ret_list:
                    ret_list.add(b)
    return ret_list


events = []
for a in protests_iss_known["Tags"]:
    events.append(categorizer(a))
#eventsssss = protests_iss_known["Tags"].apply({lambda x: categorizer(x)}) # "Racial Injustice" if "Racial Injustice" in x else "Gun Rights" if "Guns" in x else "Other" if "Other" in x else "Environment" if "Environment" in x else "Education" if "Education" in x else "Immigration" if "Immigration" in x else x
protests_iss_known["Event"] = events
Common_Events = protests_iss_known["Event"].value_counts().head(12).keys()
#print(len(protests_iss_known["Event"].value_counts()))
#protests_iss_known["Event"] = protests_iss_known["Event"].apply({lambda x: x if x in Common_Events else "Other"})


def overlapping_value_count(df,return_dict):
    s = df['Event']
    for entry in s:
        l = list(entry)
        for e in l:
            if e in return_dict.keys():
                return_dict[e] += 1
            else:
                return_dict[e] = 1
    ret_val = pd.DataFrame(list(return_dict.items()),index=range(0,len(return_dict.keys())))
    ret_val.columns = ['Tag','Count']
    ret_val.set_index('Tag',inplace=True)
    return ret_val
tag_counts = overlapping_value_count(protests_iss_known,{})
tag_counts.sort_values("Count", ascending=False).plot(y='Count',kind='pie',figsize=(10,10),fontsize=10,legend=True,title='Protest Topics',colors=sns.color_palette('tab20'))
plt.legend(bbox_to_anchor=(1., 1.0), fancybox=True, shadow=True, ncol=1)

<matplotlib.legend.Legend at 0x2ab003fd0>


protests_iss_known.Date = pd.to_datetime(protests_iss_known.Date)
protests_real_test = protests_iss.query('Attendees != Attendees')
protests_iss_attendees_known = protests_iss.dropna(subset=['Attendees'])


protests_iss_attendees_known.Date.value_counts().plot(figsize=(15,10))

<AxesSubplot:>


protests_real_test


tag_unknown = overlapping_value_count(protests_real_test,{})
tag_unknown.sort_values("Count", ascending=False).plot(y='Count',kind='pie',figsize=(8,8),colors=sns.color_palette('tab20'))
plt.legend(bbox_to_anchor=(1., 1.0), fancybox=True, shadow=True, ncol=1)

<matplotlib.legend.Legend at 0x2a922d8e0>


protests_iss_known


pirus_temp.Date_Exposure = pd.to_datetime(pirus_temp.Date_Exposure)
type(pirus_temp.iloc[0].Date_Exposure)

pandas._libs.tslibs.timestamps.Timestamp


#How to select out certain Protest issues, when Event attribute is saved to a list:
def issue_search(issue):
    return protests_iss_known[protests_iss_known['Event']&{issue}]
def state_date(row):
    return (row.Date,row.State)
def state_year(row):
    return (row.Date.year,row.State)
def vote_pcnt(tuple):
    #print('tuple',tuple)
    year = tuple[0]
    state = tuple[1]
    if year%2 != 0:
        year -= 1
        #print(year)
    if state not in pd.unique(csv_final.Region):
        return (np.nan)
    line = str(csv_final[(csv_final.Region == state)&(csv_final.Year == year)]['VEP Highest Office'])
    #print(line)
    pcnt = re.search(r'(....%)',line)
    #print(pcnt.groups())
    return float(pcnt[0][:-1])
def get_rads_by_population(tuple):
    date = tuple[0]
    state = tuple[1]
    if state not in pd.unique(total_population['Geographic Area']):
        return ('NaN')
    radicals = pd.DataFrame(pirus_temp[(pirus_temp.Date_Exposure < pd.to_datetime(date))&(pirus_temp.Loc_Plot_State1 == state)&(pirus_temp.Date_Exposure > pd.to_datetime('2000-01-01 00:00:00'))]).size
    population = total_population[total_population['Geographic Area'] == state][str(date.year)]
    return list(population/radicals)[0]
def to_raw(string):
    return fr"{string}"


votes = ['NaN']*38096
rads = [0]*38096
for e in range(0,38096):
    pcnt = vote_pcnt(state_year(protests_iss_known.iloc[e]))
    votes[e] = pcnt
    var1 = state_date(protests_iss_known.iloc[e])
    rad = get_rads_by_population(var1)
    rads[e]=rad
protests_iss_known['State_voters'] = votes
protests_iss_known['Radicals'] = rads

all_tags= ['Racial','45th President', 'Gun Rights', 'Gun Control', 'Other', 'Environment', 'Education', 'Healthcare', 'Immigration', 'Executive', 'International Relations', 'Legislative', 'Civil Rights', 'Other']
for t in all_tags:
    protests_iss_known[t] = [0]*38096
    for e in list(issue_search(t).index):
        e = int(e)
        protests_iss_known.loc[e,t]=1


protests_iss_known.loc[protests_iss_known.Radicals == inf,'Radicals'] = 0
protests_iss_known.loc[protests_iss_known.Radicals == pd.NA,'Radicals'] = np.nan


protests_real_test = protests_iss_known[protests_iss_known.Attendees.isnull()]
protests_real_test = protests_real_test.dropna(subset =['Radicals'])
protests_real_test = protests_real_test.dropna(subset =['State_voters'])
protests_iss_attendees_known = protests_iss_known.dropna(subset=['Attendees'])
protests_iss_attendees_known = protests_iss_attendees_known.dropna(subset=['State_voters'])
protests_iss_attendees_known = protests_iss_attendees_known.dropna(subset=['Radicals'])
protests_iss_attendees_known = protests_iss_attendees_known.dropna(subset=['Radicals'])


protests_iss_attendees_known.Radicals = protests_iss_attendees_known.Radicals.astype('int')
protests_iss_attendees_known.Attendees = protests_iss_attendees_known.Attendees.astype('int')


percentile = []
threshold = []
for num in pd.unique(protests_iss_attendees_known.Attendees):
  percentile.append(protests_iss_attendees_known[protests_iss_attendees_known.Attendees < num].size/protests_iss_attendees_known.size)
  threshold.append(num)
eval = pd.DataFrame()
eval['Distribution'] = percentile
eval['Threshold'] = threshold
eval.plot(kind='scatter',x='Threshold',y='Distribution',figsize = (15,8),alpha=0.4,title='Attendance Percentile Values')

<AxesSubplot:title={'center':'Attendance Percentile Values'}, xlabel='Threshold', ylabel='Distribution'>


protests_iss_attendees_known.Date = pd.to_datetime(protests_iss_attendees_known.Date)


protests_iss_attendees_known.plot(kind='scatter',x='Date',y='Attendees',figsize=(15,8),s=8,c='red',alpha=0.7, title='Protest Attendance by Event 2017-2021')

<AxesSubplot:title={'center':'Protest Attendance by Event 2017-2021'}, xlabel='Date', ylabel='Attendees'>


protests_attendees_known_filtered = protests_iss_attendees_known.loc[protests_iss_attendees_known.Attendees<2500]


protests_attendees_known_filtered.plot(kind='scatter',x='Date',y='Attendees',figsize=(15,8),s=8,c='red',alpha=0.7, title='Protest Attendance by Event 2017-2021')

<AxesSubplot:title={'center':'Protest Attendance by Event 2017-2021'}, xlabel='Date', ylabel='Attendees'>


from sklearn.metrics import accuracy_score
protests_attendees_known_filtered.Date = protests_iss_attendees_known.Date.astype('int')
feats = ['Date','State','State_voters', 'Racial', '45th President', 'Gun Rights', 'Gun Control',
       'Other', 'Environment', 'Education', 'Healthcare', 'Immigration',
       'Executive', 'International Relations', 'Legislative', 'Civil Rights',
       'Radicals']
vec = DictVectorizer(sparse=False)
scaler = StandardScaler()
X_dict = protests_attendees_known_filtered[feats].to_dict(orient="records")
X_train = vec.fit_transform(X_dict)
y = protests_attendees_known_filtered["Attendees"]

# specify the pipeline
kays = []
accuracy = []
cvs = []

for num in range(1,75,2):
  model = KNeighborsRegressor(n_neighbors=num)
  pipeline = Pipeline([("vectorizer", vec), ("scaler", scaler), ("fit", model)])
  scores = cross_val_score(pipeline, X_dict, y, 
                         cv=10, scoring='neg_root_mean_squared_error')
  accuracy.append(scores.mean())
  kays.append(num)

for_plot = pd.DataFrame()
for_plot['K-value'] = kays
for_plot['RMSE'] = accuracy


for_plot['F1'] = accuracy
for_plot.rename(columns={'F1':'RMSE'},inplace=True)
for_plot['Class'] = 'Train'


plt1 = for_plot.plot(x='K-value',y='RMSE',figsize=(15,8),ylabel='Negative Root of Mean Squared Error',title="Negative Root of Mean Squared Error for Neighbors Across 10 Folds")
"""plt2 = for_plot[for_plot.Division == 2].plot(x='K-value',y='RMSE',ax=plt1)
plt3 = for_plot[for_plot.Division == 3].plot(x='K-value',y='RMSE',ax=plt1)
plt4 = for_plot[for_plot.Division == 4].plot(x='K-value',y='RMSE',ax=plt1)
plt5 = for_plot[for_plot.Division == 5].plot(x='K-value',y='RMSE',ax=plt1)
plt6 = for_plot[for_plot.Division == 6].plot(x='K-value',y='RMSE',ax=plt1)
plt7 = for_plot[for_plot.Division == 7].plot(x='K-value',y='RMSE',ax=plt1)
plt9 = for_plot[for_plot.Division == 9].plot(x='K-value',y='RMSE',ax=plt1)
plt8 = for_plot[for_plot.Division == 8].plot(x='K-value',y='RMSE',ax=plt1)
plt10 = for_plot[for_plot.Division == 10].plot(x='K-value',y='RMSE',ax=plt1)"""

"plt2 = for_plot[for_plot.Division == 2].plot(x='K-value',y='RMSE',ax=plt1)\nplt3 = for_plot[for_plot.Division == 3].plot(x='K-value',y='RMSE',ax=plt1)\nplt4 = for_plot[for_plot.Division == 4].plot(x='K-value',y='RMSE',ax=plt1)\nplt5 = for_plot[for_plot.Division == 5].plot(x='K-value',y='RMSE',ax=plt1)\nplt6 = for_plot[for_plot.Division == 6].plot(x='K-value',y='RMSE',ax=plt1)\nplt7 = for_plot[for_plot.Division == 7].plot(x='K-value',y='RMSE',ax=plt1)\nplt9 = for_plot[for_plot.Division == 9].plot(x='K-value',y='RMSE',ax=plt1)\nplt8 = for_plot[for_plot.Division == 8].plot(x='K-value',y='RMSE',ax=plt1)\nplt10 = for_plot[for_plot.Division == 10].plot(x='K-value',y='RMSE',ax=plt1)"


protests_iss_attendees_known[protests_iss_attendees_known.Attendees < 10000].plot(kind='scatter',x='Date',y='Attendees',figsize=(20,10),s=8,c='red',alpha=0.2)

<AxesSubplot:xlabel='Date', ylabel='Attendees'>


pirus_temp.Date_Exposure = pd.to_datetime(pirus_temp.Date_Exposure)


pirus_temp


pirus_temp.set_index('Date_Exposure')
rad_counts = pirus_temp.sort_index().value_counts('Date_Exposure',sort=False)
rad_counts.plot(x='Date_Exposure',figsize=(20,10))

#rad_counts.plot(x='Date_Exposure', y = '0')

<AxesSubplot:xlabel='Date_Exposure'>


pirus_temp.Year = pd.to_numeric(pirus_temp.Year)
since_17 = pirus_temp.loc[pirus_temp.Year>=17]
since_17.set_index('Date_Exposure')


rad_counts2 = since_17.sort_index().value_counts('Date_Exposure',sort=False)


rad_counts2 = rad_counts2.reset_index().rename(columns={"Date_Exposure":"Date",0:"freq"})


merged_data = protests_iss_attendees_known[["Date","Attendees"]].merge(rad_counts2[["Date","freq"]], on='Date', how='inner')
merged_data


fig, ax1 = plt.subplots()
ax2 = ax1.twinx()
fig.set_size_inches(18.5, 10.5)

ax1.scatter(merged_data["Date"], merged_data["Attendees"], c='blue', s=50, alpha=0.4)
ax2.scatter(merged_data["Date"], merged_data["freq"], c='red', s=50, alpha=0.15)
plt.title("Temporary Title") # title this
ax1.set_xlabel("Date")
ax1.set_ylabel("Attendees")
ax2.set_ylabel("Frequency")

plt.show()


fin = pd.DataFrame(protests_iss_attendees_known.value_counts('State'),columns=['Known'])


fin2 = pd.DataFrame(protests_real_test.value_counts('State'),columns=['Unknown']).join(fin)
#fin['Unknown'] = fin2['Unknown']


fin2.plot(kind = 'scatter',figsize=(15,10),x='Known',y='Unknown')
states = list(fin2.reset_index()["State"])
x_vals = list(fin2.reset_index()["Known"])
y_vals = list(fin2.reset_index()["Unknown"])
'''for i in range(len(x_vals)):
    plt.text(x_vals[i], y_vals[i], states[i], fontsize=8)'''

'for i in range(len(x_vals)):\n    plt.text(x_vals[i], y_vals[i], states[i], fontsize=8)'


"""def issue_search_df(issue,df):
    return df[df['Event']&{issue}]"""
def autopct(pct):
    return ('%.2f' % pct)
tag_counts_real = overlapping_value_count(protests_real_test,{})
tag_counts_real.sort_values("Count",ascending=False).plot(y='Count',kind='pie',figsize=(10,10),fontsize=10,legend=True,autopct=autopct,title='Protest Topics',colors=sns.color_palette('tab20'))
plt.legend(bbox_to_anchor=(1., 1.0), fancybox=True, shadow=True, ncol=1)

<matplotlib.legend.Legend at 0x17f573970>


tag_counts_known = overlapping_value_count(protests_iss_attendees_known,{})
tag_counts_known.sort_values("Count",ascending=False).plot(y='Count',kind='pie',figsize=(10,10),autopct=autopct,fontsize=10,legend=True,title='Protest Topics',colors=sns.color_palette('tab20'))
plt.legend(bbox_to_anchor=(1., 1.0), fancybox=True, shadow=True, ncol=1)

<matplotlib.legend.Legend at 0x17f4f9f70>


protests_iss_attendees_known.head()


protests_iss_attendees_known.set_index('Date')
known_freq = protests_iss_attendees_known.sort_index().value_counts('Date',sort=False)#,columns=['Freq'],index=None)
axis = known_freq.plot(x='Date',figsize=(20,10))

protests_real_test.set_index('Date')
unknown_freq = protests_real_test.sort_index().value_counts('Date',sort=False)#,columns=['Freq'],index=None)
unknown_freq.plot(x='Date',figsize=(20,10),ax=axis)

#known_freq#.plot(x='Date',y="Freq", c='blue', alpha=0.4)


"""pirus_temp.set_index('Date_Exposure')
rad_counts = pirus_temp.sort_index().value_counts('Date_Exposure',sort=False)
rad_counts.plot(x='Date_Exposure',figsize=(20,10))"""

"pirus_temp.set_index('Date_Exposure')\nrad_counts = pirus_temp.sort_index().value_counts('Date_Exposure',sort=False)\nrad_counts.plot(x='Date_Exposure',figsize=(20,10))"


x =protests_iss_attendees_known.value_counts('State').plot(kind='bar',y='State',color='orange',figsize=(15,8))
protests_real_test.value_counts('State').plot(kind='bar',y='State',ax=x)

<AxesSubplot:xlabel='State'>


known = 23009
unknown = 15046
fin2['Unknown_pcnt'] = fin2['Unknown']/unknown
fin2['Known_pcnt'] = fin2['Known']/known
fin2.plot(kind='scatter',x='Unknown_pcnt',y='Known_pcnt',figsize=(15,8))

<AxesSubplot:xlabel='Unknown_pcnt', ylabel='Known_pcnt'>


fin2['Known'].corr(fin2['Unknown'])

0.9799869422463429


protests_attendees_known_filtered.Date = protests_iss_attendees_known.Date.astype('int')
protests_real_test.Date = protests_real_test.Date.astype('int')
pd.options.display.max_rows = 5

feats = ['Date','State','State_voters', 'Racial', '45th President', 'Gun Rights', 'Gun Control',
       'Other', 'Environment', 'Education', 'Healthcare', 'Immigration',
       'Executive', 'International Relations', 'Legislative', 'Civil Rights',
       'Radicals']
X_train_dict = protests_attendees_known_filtered[feats].to_dict(orient="records")
y_train = protests_attendees_known_filtered["Attendees"]

x_new = protests_real_test
X_new_dict=x_new[feats].to_dict(orient="records")

# Dummy encoding
vec = DictVectorizer(sparse=False)
vec.fit(X_train_dict)
X_train = vec.transform(X_train_dict)
X_new = vec.transform(X_new_dict)

# Standardization
scaler = StandardScaler()
scaler.fit(X_train)
X_train_sc = scaler.transform(X_train)
X_new_sc = scaler.transform(X_new)

# K-Nearest Neighbors Model
model = KNeighborsRegressor(n_neighbors=55)
model.fit(X_train_sc, y_train)
protests_real_test['predicted_attendeance'] = model.predict(X_new_sc)


protests_attendees_known_filtered.Date = pd.to_datetime(protests_attendees_known_filtered.Date)
protests_real_test.Date = pd.to_datetime(protests_real_test.Date)
fin_axis = protests_attendees_known_filtered.plot(kind="scatter",x="Date",y="Attendees",c='green',alpha=0.4,figsize=(18,10))
protests_real_test.plot(kind="scatter",x="Date",y="predicted_attendeance",ax=fin_axis,alpha=0.4)

<AxesSubplot:xlabel='Date', ylabel='predicted_attendeance'>

	Region	VEP Total Ballots Counted	VEP Highest Office	VAP Highest Office	Total Ballots Counted	Highest Office	Voting-Eligible Population (VEP)	Voting-Age Population (VAP)	% Non-citizen	Prison	Probation	Parole	Total Ineligible Felon	Overseas Eligible	Year
0	United States	55.3%	54.2%	50.0%	107,390,107	105,375,486	194,331,436	210,623,408	7.7%	1,377,013	2,339,388	536,039	3,082,746	2,937,000	2000
1	Alabama	NaN	51.6%	50.1%	NaN	1,672,551	3,241,682	3,334,576	1.5%	26,225	40,178	5,484	51,798	NaN	2000
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
310	Wisconsin	52.4%	52.0%	49.7%	2,185,021	2,171,331	4,172,130	4,365,214	3.2%	22,724	22,602	19,572	55,112	NaN	2010
311	Wyoming	46.0%	45.5%	43.8%	190,822	188,463	414,536	430,673	2.4%	2,059	3,231	682	5,684	NaN	2010

	Region	VEP Total Ballots Counted	VEP Highest Office	VAP Highest Office	Total Ballots Counted	Highest Office	Voting-Eligible Population (VEP)	Voting-Age Population (VAP)	% Non-citizen	Prison	Probation	Parole	Total Ineligible Felon	Overseas Eligible	Year
0	United States	55.3%	54.2%	50.0%	107,390,107	105,375,486	194,331,436	210,623,408	7.7%	1,377,013	2,339,388	536,039	3,082,746	2,937,000	2000
1	Alabama	NaN	51.6%	50.1%	NaN	1,672,551	3,241,682	3,334,576	1.5%	26,225	40,178	5,484	51,798	NaN	2000
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
414	Wisconsin	56.9%	56.6%	53.9%	2,422,248	2,410,314	4,260,427	4,454,970	3.1%	22,097	46,212	20,010	67,986	NaN	2014
415	Wyoming	39.7%	39.0%	37.3%	171,153	168,390	431,434	445,626	2.7%	2,330	5,196	715	5,955	NaN	2014

	Region	VEP Total Ballots Counted	VEP Highest Office	Total Ballots Counted	Highest Office	Voting-Eligible Population (VEP)	Voting-Age Population (VAP)	% Non-citizen	Prison	Probation	Parole	Total Ineligible Felon	Overseas Eligible	Year
0	United States	55.3%	54.2%	107,390,107	105,375,486	194,331,436	210,623,408	7.7%	1,377,013	2,339,388	536,039	3,082,746	2,937,000	2000
1	Alabama	NaN	51.6%	NaN	1,672,551	3,241,682	3,334,576	1.5%	26,225	40,178	5,484	51,798	NaN	2000
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
466	Wisconsin	61.4%	61.4%	2,675,000	2,673,308	4,354,527	4,563,564	3.1%	22,889	44,489	20,401	68,649	NaN	2018
467	Wyoming	47.9%	47.4%	205,275	203,420	428,898	445,747	2.5%	2,323	4,666	842	5,825	NaN	2018

	Region	VEP Total Ballots Counted	VEP Highest Office	Total Ballots Counted	Highest Office	Voting-Eligible Population (VEP)	Voting-Age Population (VAP)	% Non-citizen	Prison	Probation	Parole	Total Ineligible Felon	Overseas Eligible	Year
0	United States	55.3%	54.2%	107,390,107	105,375,486	194,331,436	210,623,408	7.7%	1,377,013	2,339,388	536,039	3,082,746	2,937,000	2000
1	Alabama	NaN	51.6%	NaN	1,672,551	3,241,682	3,334,576	1.5%	26,225	40,178	5,484	51,798	NaN	2000
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
570	Wisconsin	75.8%	75.5%	3,310,000	3,298,041	4,368,530	4,586,746	3.2%	23,574	42,909	21,015	71,193	NaN	2020
571	Wyoming	64.6%	64.2%	278,503	276,765	431,364	447,915	2.2%	2,488	5,383	934	6,759	NaN	2020

	Date	Location	Attendees	Event (legacy; see tags)	Tags	Curated	Source	Total_Articles
0	2017-01-16	Johnson City, TN	300.0	Civil Rights	Civil Rights; For racial justice; Martin Luthe...	Yes	http://www.johnsoncitypress.com/Local/2017/01/...	4
1	2017-01-16	Indianapolis, IN	20.0	Environment	Environment; For wilderness preservation	Yes	http://wishtv.com/2017/01/16/nature-groups-pro...	1
...	...	...	...	...	...	...	...	...
18	2017-01-20	Richmond, VA	2000.0	Executive (Inauguration March)	Executive; Against 45th president	Yes	http://richmondfreepress.com/news/2017/jan/20/...	2
19	2017-01-20	Madison, WI	100.0	Executive	Executive; Against 45th president	Yes	http://www.channel3000.com/news/politics/peace...	1

Project Goals

Voter Turnout: 2000-2022

Cleaning the Data

Step 1: Concatenate years 2000-2010

Step 2: Drop State Abbreviations and Excess Rows

2018

2016

2020

Radicalized Individuals in the United States

1. Loading the PIRUS Data

Protests in the United States

Population Data

Merging Data

Protests and Radicalized Individuals Based On State

Normalized Protests and Radicalized Individuals

Models

Model 1: Preface

Tagging System

Model 1: Building the Model</h1>
We previously saved the protests with unknown attendees to the DataFrame protests_real_test. Let's revisit that data.

Model 2

Model 2

Missing Attendance Data

Conclusion

	Subject_ID	Loc_Plot_State1	Loc_Plot_City1	Loc_Plot_State2	Loc_Plot_City2	Date_Exposure	Plot_Target1	Plot_Target2	Plot_Target3	Attack_Preparation	...	Previous_Criminal_Activity_Type2	Previous_Criminal_Activity_Type3	Previous_Criminal_Activity_Age	Gang	Gang_Age_Joined	Trauma	Other_Ideologies	Angry_US	Group_Grievance	Standing
0	4857	New York	-99	NaN	NaN	1/1/49	-88	NaN	NaN	-88	...	NaN	NaN	-99	0	-88	3	0	1	-99	-99
1	5803	Alabama	Birmingham	NaN	NaN	1/1/49	-88	NaN	NaN	-88	...	NaN	NaN	-88	0	-88	-99	0	-99	2	-99
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
2223	1374	California	Los Angeles	NaN	NaN	11/26/18	14	NaN	NaN	1	...	NaN	NaN	-99	0	-88	-99	0	-99	-99	-99
2224	8295	Ohio	Toledo	NaN	NaN	12/10/18	3	14.0	15.0	2	...	NaN	NaN	2	0	-88	-99	0	1	3	-99

	Geographic Area	April 1, 2020 Estimates Base	2020	2021
6	Alabama	5024279.0	5024803.0	5039877.0
7	Alaska	733391.0	732441.0	732673.0
8	Arizona	7151502.0	7177986.0	7276316.0
9	Arkansas	3011524.0	3012232.0	3025891.0
10	California	39538223.0	39499738.0	39237836.0

	Geographic Area	April 1, 2010	Estimates Base	2010	2011	2012	2013	2014	2015	2016	2017	2018	2019
6	Alabama	4779736.00	4780125.00	4785437.0	4799069.0	4815588.0	4830081.0	4841799.0	4852347.0	4863525.0	4874486.0	4887681.0	4903185.0
7	Alaska	710231.00	710249.00	713910.0	722128.0	730443.0	737068.0	736283.0	737498.0	741456.0	739700.0	735139.0	731545.0
8	Arizona	6392017.00	6392288.00	6407172.0	6472643.0	6554978.0	6632764.0	6730413.0	6829676.0	6941072.0	7044008.0	7158024.0	7278717.0
9	Arkansas	2915918.00	2916031.00	2921964.0	2940667.0	2952164.0	2959400.0	2967392.0	2978048.0	2989918.0	3001345.0	3009733.0	3017804.0
10	California	37253956.00	37254519.00	37319502.0	37638369.0	37948800.0	38260787.0	38596972.0	38918045.0	39167117.0	39358497.0	39461588.0	39512223.0

	2010	2011	2012	2013	2014	2015	2016	2017	2018	2019	2020	2021
Geographic Area
Alabama	4785437.0	4799069.0	4815588.0	4830081.0	4841799.0	4852347.0	4863525.0	4874486.0	4887681.0	4903185.0	5024803.0	5039877.0
Alaska	713910.0	722128.0	730443.0	737068.0	736283.0	737498.0	741456.0	739700.0	735139.0	731545.0	732441.0	732673.0
Arizona	6407172.0	6472643.0	6554978.0	6632764.0	6730413.0	6829676.0	6941072.0	7044008.0	7158024.0	7278717.0	7177986.0	7276316.0
Arkansas	2921964.0	2940667.0	2952164.0	2959400.0	2967392.0	2978048.0	2989918.0	3001345.0	3009733.0	3017804.0	3012232.0	3025891.0
California	37319502.0	37638369.0	37948800.0	38260787.0	38596972.0	38918045.0	39167117.0	39358497.0	39461588.0	39512223.0	39499738.0	39237836.0

	Radicalized_num	Protest_num
Loc_Habitation_State1
California	137	4439.0
New York	108	2688.0
Texas	84	1649.0
Florida	83	1823.0
Minnesota	70	747.0

	Radicalized_num	Protest_num	Population
State
Alabama	24.0	281.0	5039877.0
Alaska	8.0	252.0	732673.0
Arizona	36.0	563.0	7276316.0
Arkansas	7.0	174.0	3025891.0
California	137.0	4439.0	39237836.0

	Date	Location	Event	Attendees	State	Tags
2	2017-01-16	Cincinnati, OH	{Racial, Civil Rights}	NaN	Ohio	Civil Rights; For racial justice; Martin Luthe...
4	2017-01-19	Washington, DC	{45th President, Executive}	NaN	District of Columbia	Executive; Against 45th president
...	...	...	...	...	...	...
38092	2021-01-31	Topeka, KS	{Civil Rights}	NaN	Kansas	Civil Rights; For abortion rights
38094	2021-01-31	Salt Lake City, UT	{Other}	NaN	Utah	Other; Against deregulation; Business

	Subject_ID	Loc_Plot_State1	Loc_Plot_City1	Loc_Plot_State2	Loc_Plot_City2	Date_Exposure	Plot_Target1	Plot_Target2	Plot_Target3	Attack_Preparation	...	Previous_Criminal_Activity_Type3	Previous_Criminal_Activity_Age	Gang	Gang_Age_Joined	Trauma	Other_Ideologies	Angry_US	Group_Grievance	Standing	Year
882	3005	-99	-99	NaN	NaN	2000-01-01	-88	NaN	NaN	-88	...	NaN	-88	0	-88	-99	0	1	-99	-99	0
883	3655	Montana	-99	NaN	NaN	2000-01-01	-88	NaN	NaN	-88	...	NaN	-99	0	-88	-99	0	-99	-99	-99	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
2223	1374	California	Los Angeles	NaN	NaN	2018-11-26	14	NaN	NaN	1	...	NaN	-99	0	-88	-99	0	-99	-99	-99	18
2224	8295	Ohio	Toledo	NaN	NaN	2018-12-10	3	14.0	15.0	2	...	NaN	2	0	-88	-99	0	1	3	-99	18

	Subject_ID	Loc_Plot_State1	Loc_Plot_City1	Loc_Plot_State2	Loc_Plot_City2	Plot_Target1	Plot_Target2	Plot_Target3	Attack_Preparation	Op_Security	...	Previous_Criminal_Activity_Type3	Previous_Criminal_Activity_Age	Gang	Gang_Age_Joined	Trauma	Other_Ideologies	Angry_US	Group_Grievance	Standing	Year
Date_Exposure
2017-01-01	6610	California	Los Molinos	Oregon	NaN	-88	NaN	NaN	-88	-88	...	NaN	-88	0	-88	0	0	0	0	0	17
2017-01-01	6734	Minnesota	Minneapolis	NaN	NaN	-88	NaN	NaN	-88	-88	...	NaN	-99	0	-88	-99	0	1	2	0	17
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
2018-11-26	1374	California	Los Angeles	NaN	NaN	14	NaN	NaN	1	-99	...	NaN	-99	0	-88	-99	0	-99	-99	-99	18
2018-12-10	8295	Ohio	Toledo	NaN	NaN	3	14.0	15.0	2	2	...	NaN	2	0	-88	-99	0	1	3	-99	18

	Date	Attendees	freq
0	2017-01-16	300	9
1	2017-01-16	20	9
...	...	...	...
2947	2018-12-10	300	1
2948	2018-12-10	20	1

Project Goals

Voter Turnout: 2000-2022

Cleaning the Data

Step 1: Concatenate years 2000-2010

Step 2: Drop State Abbreviations and Excess Rows

2018

2016

2020

Radicalized Individuals in the United States

1. Loading the PIRUS Data

Protests in the United States

Population Data

Merging Data

Protests and Radicalized Individuals Based On State

Normalized Protests and Radicalized Individuals

Models

Model 1: Preface

Tagging System

Model 1: Building the Model</h1>We previously saved the protests with unknown attendees to the DataFrame protests_real_test. Let's revisit that data.

Model 2

Model 2

Missing Attendance Data

Conclusion

Model 1: Building the Model</h1>
We previously saved the protests with unknown attendees to the DataFrame protests_real_test. Let's revisit that data.