# Loading Libraries ----
library(tidyverse)
library(ggplot2)

# Loading and Merging Datasets ----

# NOTE(review): per the read.csv documentation, `encoding` only marks the
# strings as Latin-1; it does not re-encode the file (`fileEncoding` does).
# Confirm the non-ASCII area and column names come through as expected.
income <- read.csv("tulot2017.csv", encoding = "latin1")
str(income)

finland <- read.csv("ek2023.csv", encoding = "latin1")
str(finland)

# merge() defaults to all = FALSE, so only rows whose `Alue` value occurs in
# both tables are kept.
df <- merge(finland, income, by = "Alue")
colnames(df)

# Explorative Data Analysis ----

## Scatterplots ----

ggplot(data = df, mapping = aes(x = Tulot, y = SDP)) +
  geom_point() +
  labs(
    x = "Average Taxable Income (euros)",
    y = "Support for the SDP (%)"
  )

#' Scatterplot of a party's support against an income measure
#'
#' @param income_type Name (single string) of the income column to put on the
#'   x axis. The five known income columns are given English labels; any other
#'   column name is used verbatim as its own label.
#' @param party Name (single string) of the party column to put on the y axis.
#' @param data Data frame holding both columns. Defaults to the merged `df`.
#' @return A ggplot object.
plot_income_against_support <- function(income_type, party, data = df) {
  stopifnot(
    is.character(income_type), length(income_type) == 1L,
    is.character(party), length(party) == 1L
  )

  known_types <- c(
    "Tulot", "Mediaanitulot", "Ansiotulot", "Pääomatulot",
    "Tulot_miinus_verot"
  )
  known_labels <- c(
    "Average Taxable Income", "Median Taxable Income",
    "Average Earned Income", "Average Investment Income",
    "Average Income after Tax"
  )

  # Fall back to the raw column name instead of an empty label when the
  # income type has no English translation.
  label_index <- match(income_type, known_types)
  income_type_english <- if (is.na(label_index)) {
    income_type
  } else {
    known_labels[label_index]
  }

  ggplot(
    data = data,
    mapping = aes(x = .data[[income_type]], y = .data[[party]])
  ) +
    geom_point() +
    labs(
      title = paste("Support for", party, "against", income_type_english),
      x = paste(income_type_english, "(euros)"),
      y = paste("Support for", party, "(%)")
    ) +
    theme_minimal()
}

# Kept as separate top-level calls so each plot is auto-printed when the
# script is run interactively.
plot_income_against_support("Tulot_miinus_verot", "SDP")
plot_income_against_support("Tulot_miinus_verot", "PS")
plot_income_against_support("Tulot_miinus_verot", "KOK")
plot_income_against_support("Tulot_miinus_verot", "KESK")
plot_income_against_support("Tulot_miinus_verot", "VIHR")
plot_income_against_support("Tulot_miinus_verot", "VAS")
plot_income_against_support("Tulot_miinus_verot", "RKP")
plot_income_against_support("Tulot_miinus_verot", "KD")
plot_income_against_support("Tulot_miinus_verot", "LIIKE")

## Heatmap ----

# Columns entering the correlation matrix and, in the same order, the labels
# shown on the heatmap axes.
income_and_voting_columns <- c(
  "Tulot", "Mediaanitulot", "Ansiotulot", "Pääomatulot",
  "Tulot_miinus_verot",
  "SDP", "PS", "KOK", "KESK", "VIHR", "VAS", "RKP", "KD", "LIIKE"
)
english_names <- c(
  "Average Taxable Income", "Median Taxable Income",
  "Average Earned Income", "Average Investment Income",
  "Average Income after Tax",
  "SDP", "PS", "KOK", "KESK", "VIHR", "VAS", "RKP", "KD", "LIIKE"
)

cor_matrix <- cor(df[income_and_voting_columns])

# as.table() + as.data.frame() reshapes the matrix to long form with the
# columns Var1, Var2 and Freq (the correlation value).
cor_data <- as.data.frame(as.table(cor_matrix))
cor_data$Var1 <- factor(
  cor_data$Var1,
  levels = income_and_voting_columns,
  labels = english_names
)
cor_data$Var2 <- factor(
  cor_data$Var2,
  levels = income_and_voting_columns,
  labels = english_names
)

ggplot(cor_data, aes(Var1, Var2, fill = Freq)) +
  geom_tile(color = "white") +
  geom_text(aes(label = round(Freq, 2)), color = "black", size = 4) +
  scale_fill_gradient2(
    low = "blue", high = "red", mid = "white",
    midpoint = 0, limits = c(-1, 1),
    name = "Correlation"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
    axis.title.x = element_blank(),
    axis.title.y = element_blank()
  )

## Stacked bar chart ----

parties <- c("SDP", "PS", "KOK", "KESK", "VIHR", "VAS", "RKP", "KD", "LIIKE")

# Per electoral district, average each party's support weighted by
# `Tulonsaajia` (presumably the number of income recipients per area --
# TODO confirm against the data dictionary).
weighted_districts <- df %>%
  group_by(Vaalipiiri) %>%
  summarize(across(all_of(parties), ~ weighted.mean(., Tulonsaajia)))

weighted_districts_long <- weighted_districts %>%
  pivot_longer(
    cols = all_of(parties),
    names_to = "Party",
    values_to = "Support"
  )

ggplot(
  weighted_districts_long,
  aes(x = Vaalipiiri, y = Support, fill = Party)
) +
  geom_bar(stat = "identity") +
  labs(
    x = "Electoral District",
    y = "Weighted Average Support (%)",
    fill = "Party"
  ) +
  # theme_minimal() is a complete theme and must come first; adding it after
  # theme() would reset the rotated x-axis labels.
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Hypothesis tests ----

#' Scatterplot with a fitted linear trend: party support vs. income after tax
#'
#' @param party Name (single string) of the party column to put on the y axis.
#' @param data Data frame with `Tulot_miinus_verot` and the party column.
#'   Defaults to the merged `df`.
#' @return A ggplot object.
plot_support_regression <- function(party, data = df) {
  ggplot(
    data = data,
    mapping = aes(x = Tulot_miinus_verot, y = .data[[party]])
  ) +
    geom_point() +
    geom_smooth(method = "lm") +
    labs(
      x = "Average Income after Tax (euros)",
      y = paste0("Support for ", party, " (%)")
    ) +
    theme_minimal()
}

# Hypothesis 1
model <- lm(KOK ~ Tulot_miinus_verot, data = df)
summary(model)
plot_support_regression("KOK")

# Hypothesis 2
model <- lm(VIHR ~ Tulot_miinus_verot, data = df)
summary(model)
plot_support_regression("VIHR")

# Hypothesis 3
model <- lm(PS ~ Tulot_miinus_verot, data = df)
summary(model)

# Hypothesis 4
model <- lm(KESK ~ Tulot_miinus_verot, data = df)
summary(model)
plot_support_regression("KESK")

# Hypothesis 5
# Areas whose `Alue` value appears in this list are classified as urban;
# every other area is classified as rural.
urban_areas <- c(
  "Helsinki", "Tampere", "Turku", "Oulu", "Jyväskylä", "Lahti", "Kuopio",
  "Pori", "Joensuu", "Vaasa", "Lappeenranta", "Rovaniemi", "Seinäjoki",
  "Hämeenlinna", "Porvoo", "Kotka", "Kouvola", "Hyvinkää", "Mikkeli",
  "Kokkola", "Rauma", "Lohja", "Kajaani", "Salo", "Riihimäki", "Imatra",
  "Kemi", "Forssa", "Jakobstad", "Savonlinna", "Kirkkonummi", "Raahe",
  "Varkaus", "Valkeakoski", "Tornio", "Hamina", "Iisalmi", "Mariehamn",
  "Nummela", "Heinola", "Ilmajoki", "Kurikka", "Pieksämäki", "Ylivieska",
  "Jämsä", "Nastola", "Mäntsälä", "Siilinjärvi", "Lapua", "Uusikaupunki",
  "Vammala", "Söderkulla", "Pargas", "Orimattila", "Loimaa", "Ekenäs",
  "Kauhajoki", "Äänekoski", "Paimio", "Toijala", "Kuusamo", "Laukaa",
  "Karis", "Kankaanpää", "Nurmijärvi", "Turenki", "Mänttä", "Karkkila",
  "Hanko", "Rajamäki", "Muurame", "Muhos", "Loviisa", "Lieksa", "Joutseno",
  "Kyröskoski", "Parola", "Lauttakylä", "Laihia", "Kalajoki", "Iin Hamina",
  "Jokela", "Eura", "Orivesi", "Veikkola", "Kyläsaari", "Pihlava",
  "Vuokatti", "Keuruu", "Valkeala", "Myllykoski", "Kiiminki", "Laitila",
  "Toivala", "Vuorela", "Kauhava", "Vuores", "Nivala", "Oulainen", "Kuhmo",
  "Liminka", "Viiala", "Suonenjoki"
)
df$Type <- ifelse(df$Alue %in% urban_areas, "urban", "rural")

# One-sided two-sample t-test (t.test() defaults to the Welch variant):
# is mean KOK support greater in urban areas than in rural ones?
urban_support <- df$KOK[df$Type == "urban"]
rural_support <- df$KOK[df$Type == "rural"]
result <- t.test(
  urban_support, rural_support,
  alternative = "greater", conf.level = 0.99
)
print(result)

# Hypothesis 6
# Is mean KESK support lower in urban areas than in rural ones?
urban_support <- df$KESK[df$Type == "urban"]
rural_support <- df$KESK[df$Type == "rural"]
result <- t.test(
  urban_support, rural_support,
  alternative = "less", conf.level = 0.99
)
print(result)

# Hypothesis 7
# Is mean PS support lower in urban areas than in rural ones?
urban_support <- df$PS[df$Type == "urban"]
rural_support <- df$PS[df$Type == "rural"]
result <- t.test(
  urban_support, rural_support,
  alternative = "less", conf.level = 0.99
)
print(result)

# Hypothesis 8
# Two-sided one-sample t-test: does the district-level SDP support (weighted
# by `Tulonsaajia`) differ from 20?
weighted <- df %>%
  group_by(Vaalipiiri) %>%
  summarise(weighted_sdp_support = sum(SDP * Tulonsaajia) / sum(Tulonsaajia))
result <- t.test(weighted$weighted_sdp_support, mu = 20, conf.level = 0.99)
print(result)

# Hypothesis 9
# Is mean RKP support greater in the "Vaasa" electoral district than in all
# other districts?
vaasa <- df$RKP[df$Vaalipiiri == "Vaasa"]
other <- df$RKP[df$Vaalipiiri != "Vaasa"]
result <- t.test(vaasa, other, alternative = "greater", conf.level = 0.99)
print(result)