Files
MAT12005-Project-in-Data-An…/Task 5.r
T
2026-06-24 16:37:45 +02:00

198 lines
6.9 KiB
R

# Loading and Merging Datasets
# Read the two CSV inputs (income figures and election results, judging by
# the file names). `encoding = "latin1"` marks the character data read from
# each file as Latin-1.
income <- read.csv(file = "tulot2017.csv", encoding = "latin1")
str(object = income)
finland <- read.csv(file = "ek2023.csv", encoding = "latin1")
str(object = finland)
# Join the two tables on their shared `Alue` column. With merge()'s defaults
# only rows whose `Alue` value occurs in both tables are kept.
df <- merge(x = finland, y = income, by = "Alue")
# List the merged column names as a quick sanity check.
names(df)
# Loading Libraries
library(tidyverse)
library(ggplot2)
# Explorative Data Analysis
## Scatterplots
# One point per row of `df`: the `Tulot` column on the x axis against the
# `SDP` column on the y axis.
ggplot(df, aes(x = Tulot, y = SDP)) +
  geom_point() +
  xlab("Average Taxable Income (euros)") +
  ylab("Support for the SDP (%)")
#' Scatterplot of a party's support against an income measure
#'
#' @param income_type Name (single string) of the income column to put on the
#'   x axis. The five known Finnish column names are translated to English
#'   for the title and axis label; any other name is shown as is.
#' @param party Name (single string) of the party column to put on the y axis.
#' @param data Data frame that holds both columns. Defaults to the script's
#'   merged `df`, so existing two-argument calls behave as before.
#' @return A ggplot object; it is drawn when printed.
plot_income_against_support <- function(income_type, party, data = df) {
  stopifnot(
    is.character(income_type), length(income_type) == 1L, !is.na(income_type),
    is.character(party), length(party) == 1L, !is.na(party)
  )

  # Finnish column name -> English display name.
  income_labels <- c(
    "Tulot" = "Average Taxable Income",
    "Mediaanitulot" = "Median Taxable Income",
    "Ansiotulot" = "Average Earned Income",
    "Pääomatulot" = "Average Investment Income",
    "Tulot_miinus_verot" = "Average Income after Tax"
  )

  # Fall back to the raw column name for income types without a translation,
  # instead of producing a title and axis label with an empty name in them.
  label_index <- match(income_type, names(income_labels))
  income_type_english <- if (is.na(label_index)) {
    income_type
  } else {
    income_labels[[label_index]]
  }

  # `.data[[name]]` looks the columns up by their string names.
  ggplot(data = data, mapping = aes(x = .data[[income_type]], y = .data[[party]])) +
    geom_point() +
    labs(
      title = paste("Support for", party, "against", income_type_english),
      x = paste(income_type_english, "(euros)"),
      y = paste("Support for", party, "(%)")
    ) +
    theme_minimal()
}
# Draw one scatterplot per party against `Tulot_miinus_verot`. Each call sits
# at top level so that the returned plot is printed automatically.
plot_income_against_support(income_type = "Tulot_miinus_verot", party = "SDP")
plot_income_against_support(income_type = "Tulot_miinus_verot", party = "PS")
plot_income_against_support(income_type = "Tulot_miinus_verot", party = "KOK")
plot_income_against_support(income_type = "Tulot_miinus_verot", party = "KESK")
plot_income_against_support(income_type = "Tulot_miinus_verot", party = "VIHR")
plot_income_against_support(income_type = "Tulot_miinus_verot", party = "VAS")
plot_income_against_support(income_type = "Tulot_miinus_verot", party = "RKP")
plot_income_against_support(income_type = "Tulot_miinus_verot", party = "KD")
plot_income_against_support(income_type = "Tulot_miinus_verot", party = "LIIKE")
## Heatmap
# Pairwise correlations between the five income columns and the nine party
# columns of `df`, drawn as a labelled tile grid.
income_and_voting_columns <- c(
  "Tulot", "Mediaanitulot", "Ansiotulot", "Pääomatulot", "Tulot_miinus_verot",
  "SDP", "PS", "KOK", "KESK", "VIHR", "VAS", "RKP", "KD", "LIIKE"
)
# English display names, in the same order as `income_and_voting_columns`.
english_names <- c(
  "Average Taxable Income", "Median Taxable Income", "Average Earned Income",
  "Average Investment Income", "Average Income after Tax",
  "SDP", "PS", "KOK", "KESK", "VIHR", "VAS", "RKP", "KD", "LIIKE"
)
cor_matrix <- cor(df[income_and_voting_columns])
# as.table() + as.data.frame() reshape the matrix to long form: one row per
# (Var1, Var2) pair with the correlation stored in `Freq`.
cor_data <- as.data.frame(as.table(cor_matrix))
# Relabel both axes with the English names while keeping the column order.
cor_data$Var1 <- factor(
  cor_data$Var1,
  levels = income_and_voting_columns,
  labels = english_names
)
cor_data$Var2 <- factor(
  cor_data$Var2,
  levels = income_and_voting_columns,
  labels = english_names
)
ggplot(cor_data, aes(Var1, Var2, fill = Freq)) +
  geom_tile(color = "white") +
  geom_text(aes(label = round(Freq, 2)), color = "black", size = 4) +
  scale_fill_gradient2(
    low = "blue",
    high = "red",
    mid = "white",
    midpoint = 0,
    # Spelled out in full: `limit` only worked through partial argument
    # matching of the scale's `limits` argument.
    limits = c(-1, 1),
    name = "Correlation"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
    axis.title.x = element_blank(),
    axis.title.y = element_blank()
  )
## Stacked bar chart
parties <- c("SDP", "PS", "KOK", "KESK", "VIHR", "VAS", "RKP", "KD", "LIIKE")
# Support per `Vaalipiiri` group: the mean of each party column over the
# group's rows, weighted by `Tulonsaajia`.
weighted_districts <- df %>%
  group_by(Vaalipiiri) %>%
  summarize(across(all_of(parties), ~ weighted.mean(., Tulonsaajia)))
# Long form (one row per district/party pair) so the parties can be stacked.
weighted_districts_long <- weighted_districts %>%
  pivot_longer(cols = all_of(parties), names_to = "Party", values_to = "Support")
ggplot(weighted_districts_long, aes(x = Vaalipiiri, y = Support, fill = Party)) +
  geom_col() +
  labs(
    x = "Electoral District",
    y = "Weighted Average Support (%)",
    fill = "Party"
  ) +
  # theme_minimal() is a complete theme and replaces every theme element, so
  # it has to come before the axis-text tweak; in the opposite order the
  # 45-degree rotation of the district labels is discarded.
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Hypothesis 1
# Simple linear regression of the `KOK` column on `Tulot_miinus_verot`.
model <- lm(formula = KOK ~ Tulot_miinus_verot, data = df)
summary(object = model)
# Scatterplot of the same two columns with the fitted straight line overlaid.
ggplot(df, aes(Tulot_miinus_verot, KOK)) +
  geom_point() +
  geom_smooth(method = "lm") +
  xlab("Average Income after Tax (euros)") +
  ylab("Support for KOK (%)") +
  theme_minimal()
# Hypothesis 2
# Simple linear regression of the `VIHR` column on `Tulot_miinus_verot`.
model <- lm(formula = VIHR ~ Tulot_miinus_verot, data = df)
summary(object = model)
# Scatterplot of the same two columns with the fitted straight line overlaid.
ggplot(df, aes(Tulot_miinus_verot, VIHR)) +
  geom_point() +
  geom_smooth(method = "lm") +
  xlab("Average Income after Tax (euros)") +
  ylab("Support for VIHR (%)") +
  theme_minimal()
# Hypothesis 3
# Simple linear regression of the `PS` column on `Tulot_miinus_verot`
# (summary only, no plot for this hypothesis).
model <- lm(formula = PS ~ Tulot_miinus_verot, data = df)
summary(object = model)
# Hypothesis 4
# Simple linear regression of the `KESK` column on `Tulot_miinus_verot`.
model <- lm(formula = KESK ~ Tulot_miinus_verot, data = df)
summary(object = model)
# Scatterplot of the same two columns with the fitted straight line overlaid.
ggplot(df, aes(Tulot_miinus_verot, KESK)) +
  geom_point() +
  geom_smooth(method = "lm") +
  xlab("Average Income after Tax (euros)") +
  ylab("Support for KESK (%)") +
  theme_minimal()
# Hypothesis 5
# Names treated as urban; every row of `df` whose `Alue` is not in this list
# is classified as rural.
# NOTE(review): entries that do not occur in `df$Alue` have no effect, so
# confirm the spellings here match the ones used in the data.
urban_areas <- c(
  "Helsinki", "Tampere", "Turku", "Oulu", "Jyväskylä", "Lahti", "Kuopio", "Pori",
  "Joensuu", "Vaasa", "Lappeenranta", "Rovaniemi", "Seinäjoki", "Hämeenlinna",
  "Porvoo", "Kotka", "Kouvola", "Hyvinkää", "Mikkeli", "Kokkola", "Rauma", "Lohja",
  "Kajaani", "Salo", "Riihimäki", "Imatra", "Kemi", "Forssa", "Jakobstad",
  "Savonlinna", "Kirkkonummi", "Raahe", "Varkaus", "Valkeakoski", "Tornio",
  "Hamina", "Iisalmi", "Mariehamn", "Nummela", "Heinola", "Ilmajoki", "Kurikka",
  "Pieksämäki", "Ylivieska", "Jämsä", "Nastola", "Mäntsälä", "Siilinjärvi", "Lapua",
  "Uusikaupunki", "Vammala", "Söderkulla", "Pargas", "Orimattila", "Loimaa", "Ekenäs",
  "Kauhajoki", "Äänekoski", "Paimio", "Toijala", "Kuusamo", "Laukaa", "Karis",
  "Kankaanpää", "Nurmijärvi", "Turenki", "Mänttä", "Karkkila", "Hanko",
  "Rajamäki", "Muurame", "Muhos", "Loviisa", "Lieksa", "Joutseno", "Kyröskoski",
  "Parola", "Lauttakylä", "Laihia", "Kalajoki", "Iin Hamina", "Jokela", "Eura",
  "Orivesi", "Veikkola", "Kyläsaari", "Pihlava", "Vuokatti", "Keuruu", "Valkeala",
  "Myllykoski", "Kiiminki", "Laitila", "Toivala", "Vuorela", "Kauhava", "Vuores",
  "Nivala", "Oulainen", "Kuhmo", "Liminka", "Viiala", "Suonenjoki"
)
# Tag each row as "urban" or "rural"; the later urban/rural tests reuse `Type`.
df[["Type"]] <- ifelse(df[["Alue"]] %in% urban_areas, "urban", "rural")
urban_support <- with(df, KOK[Type == "urban"])
rural_support <- with(df, KOK[Type == "rural"])
# One-sided two-sample t-test: is mean `KOK` greater in the urban rows than
# in the rural rows? The reported confidence interval is at the 99% level.
result <- t.test(
  x = urban_support,
  y = rural_support,
  alternative = "greater",
  conf.level = 0.99
)
print(result)
# Hypothesis 6
# One-sided two-sample t-test: is mean `KESK` lower in the rows tagged
# "urban" than in the rows tagged "rural"? Relies on `df$Type` set earlier.
urban_support <- with(df, KESK[Type == "urban"])
rural_support <- with(df, KESK[Type == "rural"])
result <- t.test(
  x = urban_support,
  y = rural_support,
  alternative = "less",
  conf.level = 0.99
)
print(result)
# Hypothesis 7
# One-sided two-sample t-test: is mean `PS` lower in the rows tagged "urban"
# than in the rows tagged "rural"? Relies on `df$Type` set earlier.
urban_support <- with(df, PS[Type == "urban"])
rural_support <- with(df, PS[Type == "rural"])
result <- t.test(
  x = urban_support,
  y = rural_support,
  alternative = "less",
  conf.level = 0.99
)
print(result)
# Hypothesis 8
# `SDP` per `Vaalipiiri` group, averaged over the group's rows with
# `Tulonsaajia` as the weight.
weighted <- summarise(
  group_by(df, Vaalipiiri),
  weighted_sdp_support = sum(SDP * Tulonsaajia) / sum(Tulonsaajia)
)
# Two-sided one-sample t-test of the group-level values against a mean of 20;
# the reported confidence interval is at the 99% level.
result <- t.test(x = weighted$weighted_sdp_support, mu = 20, conf.level = 0.99)
print(result)
# Hypothesis 9
# Split the `RKP` column by whether the row's `Vaalipiiri` is "Vaasa".
vaasa <- with(df, RKP[Vaalipiiri == "Vaasa"])
other <- with(df, RKP[Vaalipiiri != "Vaasa"])
# One-sided two-sample t-test: is mean `RKP` greater in the Vaasa rows than
# in all other rows? The reported confidence interval is at the 99% level.
result <- t.test(
  x = vaasa,
  y = other,
  alternative = "greater",
  conf.level = 0.99
)
print(result)