# Loading and Merging Datasets
# Both files are read with encoding = "latin1" so that non-ASCII text such as
# "Pääomatulot" is treated as Latin-1.
income <- read.csv(file = "tulot2017.csv", encoding = "latin1")
str(object = income)

finland <- read.csv(file = "ek2023.csv", encoding = "latin1")
str(object = finland)

# Join the two tables on their shared "Alue" column; with merge()'s defaults
# only rows whose key occurs in both tables are kept.
df <- merge(x = finland, y = income, by = "Alue")
names(df)

# Loading Libraries
library(tidyverse)
library(ggplot2)

# Exploratory Data Analysis
## Scatterplots
# Baseline scatterplot: SDP support against the `Tulot` income column,
# drawn with ggplot2's default theme.
ggplot(df, aes(Tulot, SDP)) +
  labs(
    y = "Support for the SDP (%)",
    x = "Average Taxable Income (euros)"
  ) +
  geom_point()

#' Scatterplot of party support against an income measure
#'
#' @param income_type Name (string) of the income column shown on the x axis.
#'   The columns "Tulot", "Mediaanitulot", "Ansiotulot", "Pääomatulot" and
#'   "Tulot_miinus_verot" get an English label; any other column name is used
#'   verbatim as the label instead of leaving the title and axis label blank.
#' @param party Name (string) of the party-support column shown on the y axis.
#' @param data Data frame containing both columns. Defaults to the global
#'   `df`, so existing two-argument calls keep working unchanged.
#' @return A ggplot object.
plot_income_against_support <- function(income_type, party, data = df) {
  income_labels <- c(
    "Tulot" = "Average Taxable Income",
    "Mediaanitulot" = "Median Taxable Income",
    "Ansiotulot" = "Average Earned Income",
    "Pääomatulot" = "Average Investment Income",
    "Tulot_miinus_verot" = "Average Income after Tax"
  )

  # match() returns NA for a column without a translation; in that case the
  # column name itself is the most informative label available.
  label_index <- match(income_type, names(income_labels))
  income_type_english <- if (is.na(label_index)) {
    income_type
  } else {
    income_labels[[label_index]]
  }

  ggplot(
    data = data,
    mapping = aes(x = .data[[income_type]], y = .data[[party]])
  ) +
    geom_point() +
    labs(
      title = paste("Support for", party, "against", income_type_english),
      x = paste(income_type_english, "(euros)"),
      y = paste("Support for", party, "(%)")
    ) +
    theme_minimal()
}

# One scatterplot per party, each against average income after tax.
plot_income_against_support(income_type = "Tulot_miinus_verot", party = "SDP")
plot_income_against_support(income_type = "Tulot_miinus_verot", party = "PS")
plot_income_against_support(income_type = "Tulot_miinus_verot", party = "KOK")
plot_income_against_support(income_type = "Tulot_miinus_verot", party = "KESK")
plot_income_against_support(income_type = "Tulot_miinus_verot", party = "VIHR")
plot_income_against_support(income_type = "Tulot_miinus_verot", party = "VAS")
plot_income_against_support(income_type = "Tulot_miinus_verot", party = "RKP")
plot_income_against_support(income_type = "Tulot_miinus_verot", party = "KD")
plot_income_against_support(income_type = "Tulot_miinus_verot", party = "LIIKE")

## Heatmap
# Columns entering the correlation matrix. `english_names` is paired with this
# vector position by position and supplies the axis labels of the heatmap.
income_and_voting_columns <- c(
  "Tulot", "Mediaanitulot", "Ansiotulot", "Pääomatulot", "Tulot_miinus_verot",
  "SDP", "PS", "KOK", "KESK", "VIHR", "VAS", "RKP", "KD", "LIIKE"
)
english_names <- c(
  "Average Taxable Income", "Median Taxable Income", "Average Earned Income",
  "Average Investment Income", "Average Income after Tax",
  "SDP", "PS", "KOK", "KESK", "VIHR", "VAS", "RKP", "KD", "LIIKE"
)

# Pairwise correlations, reshaped to long form: one row per pair of variables
# with the columns Var1, Var2 and Freq (the correlation value).
cor_matrix <- cor(df[income_and_voting_columns])
cor_data <- as.data.frame(as.table(cor_matrix))

# Relabel both axes with the English names while keeping the column order.
cor_data$Var1 <- factor(
  cor_data$Var1,
  levels = income_and_voting_columns,
  labels = english_names
)
cor_data$Var2 <- factor(
  cor_data$Var2,
  levels = income_and_voting_columns,
  labels = english_names
)

ggplot(cor_data, aes(Var1, Var2, fill = Freq)) +
  geom_tile(color = "white") +
  geom_text(aes(label = round(Freq, 2)), color = "black", size = 4) +
  scale_fill_gradient2(
    low = "blue",
    high = "red",
    mid = "white",
    midpoint = 0,
    # Spelled out in full: `limit` only worked through partial argument
    # matching of the scale's `limits` parameter.
    limits = c(-1, 1),
    name = "Correlation"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
    axis.title.x = element_blank(),
    axis.title.y = element_blank()
  )

## Stacked bar chart
parties <- c("SDP", "PS", "KOK", "KESK", "VIHR", "VAS", "RKP", "KD", "LIIKE")

# Per electoral district, average each party's support across its rows,
# weighting every row by its `Tulonsaajia` value.
weighted_districts <- df %>%
  group_by(Vaalipiiri) %>%
  summarize(across(all_of(parties), ~ weighted.mean(., Tulonsaajia)))

# Long form (one row per district and party) so the bars can be stacked.
weighted_districts_long <- weighted_districts %>%
  pivot_longer(
    cols = all_of(parties),
    names_to = "Party",
    values_to = "Support"
  )

ggplot(weighted_districts_long, aes(x = Vaalipiiri, y = Support, fill = Party)) +
  geom_col() +
  labs(
    x = "Electoral District",
    y = "Weighted Average Support (%)",
    fill = "Party"
  ) +
  # theme_minimal() is a complete theme and replaces any theme() settings added
  # before it, so the axis-label rotation has to come afterwards to take effect.
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Hypothesis 1
# Simple linear regression of KOK support on average income after tax.
model <- lm(formula = KOK ~ Tulot_miinus_verot, data = df)
summary(model)

# Scatterplot of the same two variables with the fitted linear trend overlaid.
ggplot(df, aes(Tulot_miinus_verot, KOK)) +
  theme_minimal() +
  labs(
    y = "Support for KOK (%)",
    x = "Average Income after Tax (euros)"
  ) +
  geom_point() +
  geom_smooth(method = "lm")

# Hypothesis 2
# Simple linear regression of VIHR support on average income after tax.
model <- lm(formula = VIHR ~ Tulot_miinus_verot, data = df)
summary(model)

# Scatterplot of the same two variables with the fitted linear trend overlaid.
ggplot(df, aes(Tulot_miinus_verot, VIHR)) +
  theme_minimal() +
  labs(
    y = "Support for VIHR (%)",
    x = "Average Income after Tax (euros)"
  ) +
  geom_point() +
  geom_smooth(method = "lm")

# Hypothesis 3
# Simple linear regression of PS support on average income after tax
# (summary only; no plot is drawn for this hypothesis).
model <- lm(formula = PS ~ Tulot_miinus_verot, data = df)
summary(model)

# Hypothesis 4
# Simple linear regression of KESK support on average income after tax.
model <- lm(formula = KESK ~ Tulot_miinus_verot, data = df)
summary(model)

# Scatterplot of the same two variables with the fitted linear trend overlaid.
ggplot(df, aes(Tulot_miinus_verot, KESK)) +
  theme_minimal() +
  labs(
    y = "Support for KESK (%)",
    x = "Average Income after Tax (euros)"
  ) +
  geom_point() +
  geom_smooth(method = "lm")

# Hypothesis 5
# Names that are classified as "urban". Classification is by exact string match
# against df$Alue (see the ifelse() below); every row whose `Alue` is not in
# this list is labelled "rural".
# NOTE(review): several entries look like Swedish-language or locality names
# (e.g. "Jakobstad", "Mariehamn", "Nummela") - confirm that they are spelled
# the same way as the `Alue` values in the data, otherwise those rows silently
# fall into the "rural" group.
urban_areas <- c(
  "Helsinki", "Tampere", "Turku", "Oulu", "Jyväskylä", "Lahti", "Kuopio", "Pori",
  "Joensuu", "Vaasa", "Lappeenranta", "Rovaniemi", "Seinäjoki", "Hämeenlinna",
  "Porvoo", "Kotka", "Kouvola", "Hyvinkää", "Mikkeli", "Kokkola", "Rauma", "Lohja",
  "Kajaani", "Salo", "Riihimäki", "Imatra", "Kemi", "Forssa", "Jakobstad",
  "Savonlinna", "Kirkkonummi", "Raahe", "Varkaus", "Valkeakoski", "Tornio",
  "Hamina", "Iisalmi", "Mariehamn", "Nummela", "Heinola", "Ilmajoki", "Kurikka",
  "Pieksämäki", "Ylivieska", "Jämsä", "Nastola", "Mäntsälä", "Siilinjärvi", "Lapua",
  "Uusikaupunki", "Vammala", "Söderkulla", "Pargas", "Orimattila", "Loimaa", "Ekenäs",
  "Kauhajoki", "Äänekoski", "Paimio", "Toijala", "Kuusamo", "Laukaa", "Karis",
  "Kankaanpää", "Nurmijärvi", "Turenki", "Mänttä", "Karkkila", "Hanko",
  "Rajamäki", "Muurame", "Muhos", "Loviisa", "Lieksa", "Joutseno", "Kyröskoski",
  "Parola", "Lauttakylä", "Laihia", "Kalajoki", "Iin Hamina", "Jokela", "Eura",
  "Orivesi", "Veikkola", "Kyläsaari", "Pihlava", "Vuokatti", "Keuruu", "Valkeala",
  "Myllykoski", "Kiiminki", "Laitila", "Toivala", "Vuorela", "Kauhava", "Vuores",
  "Nivala", "Oulainen", "Kuhmo", "Liminka", "Viiala", "Suonenjoki"
)

# Add a two-level `Type` column ("urban" / "rural") to df; it is used to split
# the data into the two samples compared in the t-tests that follow.
df$Type <- ifelse(df$Alue %in% urban_areas, "urban", "rural")

# One-sided two-sample t-test: is mean KOK support greater in urban rows
# than in rural rows?
urban_support <- df[["KOK"]][df[["Type"]] == "urban"]
rural_support <- df[["KOK"]][df[["Type"]] == "rural"]
result <- t.test(
  x = urban_support,
  y = rural_support,
  conf.level = 0.99,
  alternative = "greater"
)
print(result)

# Hypothesis 6
# One-sided two-sample t-test: is mean KESK support lower in urban rows
# than in rural rows?
urban_support <- df[["KESK"]][df[["Type"]] == "urban"]
rural_support <- df[["KESK"]][df[["Type"]] == "rural"]
result <- t.test(
  x = urban_support,
  y = rural_support,
  conf.level = 0.99,
  alternative = "less"
)
print(result)

# Hypothesis 7
# One-sided two-sample t-test: is mean PS support lower in urban rows
# than in rural rows?
urban_support <- df[["PS"]][df[["Type"]] == "urban"]
rural_support <- df[["PS"]][df[["Type"]] == "rural"]
result <- t.test(
  x = urban_support,
  y = rural_support,
  conf.level = 0.99,
  alternative = "less"
)
print(result)

# Hypothesis 8
# SDP support per electoral district, weighting each row by `Tulonsaajia`.
weighted <- summarise(
  group_by(df, Vaalipiiri),
  weighted_sdp_support = sum(Tulonsaajia * SDP) / sum(Tulonsaajia)
)

# Two-sided one-sample t-test of the district-level values against mu = 20.
result <- t.test(
  x = weighted$weighted_sdp_support,
  conf.level = 0.99,
  mu = 20
)
print(result)

# Hypothesis 9
# One-sided two-sample t-test: is mean RKP support greater in rows of the
# "Vaasa" electoral district than in all other rows?
vaasa <- df[["RKP"]][df[["Vaalipiiri"]] == "Vaasa"]
other <- df[["RKP"]][df[["Vaalipiiri"]] != "Vaasa"]
result <- t.test(
  x = vaasa,
  y = other,
  conf.level = 0.99,
  alternative = "greater"
)
print(result)