MAT12005-Project-in-Data-An…/Task 1-4.r

# Task 1
# a)
library(ggplot2)
pkseutu <- read.csv("pkseutu2015.csv", encoding = "latin1")

ggplot(data = pkseutu) +
  geom_point(mapping = aes(x = Prosentti, y = KOK)) +
  labs(
    title = "Support for KOK versus turnout",
    x = "Voter turnout (%)",
    y = "KOK support (%)"
  )

ggplot(data = pkseutu) +
  geom_point(mapping = aes(x = Prosentti, y = SDP)) +
  labs(
    title = "Support for SDP versus turnout",
    x = "Voter turnout (%)",
    y = "SDP support (%)"
  )

ggplot(data = pkseutu) +
  geom_point(mapping = aes(x = Prosentti, y = PS)) +
  labs(
    title = "Support for PS versus turnout",
    x = "Voter turnout (%)",
    y = "PS support (%)"
  )

ggplot(data = pkseutu) +
  geom_point(mapping = aes(x = Prosentti, y = KESK)) +
  labs(
    title = "Support for KESK versus turnout",
    x = "Voter turnout (%)",
    y = "KESK support (%)"
  )

ggplot(data = pkseutu) +
  geom_point(mapping = aes(x = Prosentti, y = VAS)) +
  labs(
    title = "Support for VAS versus turnout",
    x = "Voter turnout (%)",
    y = "VAS support (%)"
  )

ggplot(data = pkseutu) +
  geom_point(mapping = aes(x = Prosentti, y = VIHR)) +
  labs(
    title = "Support for VIHR versus turnout",
    x = "Voter turnout (%)",
    y = "VIHR support (%)"
  )

ggplot(data = pkseutu) +
  geom_point(mapping = aes(x = Prosentti, y = RKP)) +
  labs(
    title = "Support for RKP versus turnout",
    x = "Voter turnout (%)",
    y = "RKP support (%)"
  )

ggplot(data = pkseutu) +
  geom_point(mapping = aes(x = Prosentti, y = KD)) +
  labs(
    title = "Support for KD versus turnout",
    x = "Voter turnout (%)",
    y = "KD support (%)"
  )

ggplot(data = pkseutu) +
  geom_point(mapping = aes(x = Prosentti, y = Muut)) +
  labs(
    title = "Support for Muut versus turnout",
    x = "Voter turnout (%)",
    y = "Muut support (%)"
  )

# The party whose support seems to be growing as the turnout increases is KOK, as is shown in the first scatterplot.

# Based on the first scattersplot, i) is the only interpreation that can be made. This is because there seems to be a strong positive correlation between voter turnout and support for KOK.
# ii) cannot be made, since correlation does not imply causation. High voter turnout seems to correlate to, but does not guarantee, high support for KOK. "Obviously" is a strong adverb and can only be used if there is a strict proof of causation.
# iii) cannot be made either. Again, correlation does not imply causation. Support for KOK seems to correlate to, but does not guarantee, high voter turnout. The "obviously" adverb cannot be used for the same reason as above.


# b)
# In my opinion, the support for SDP seems to explain well the support for PS.
# We can plot the support for the two parties and add a line of best fit as follows:
ggplot(data = pkseutu, aes(x = SDP, y = PS)) +
  geom_point() +
  geom_smooth(method = "lm")

# Creating a linear regression model to explain support for PS using support for SDP as predictor
model <- lm(PS ~ SDP, data = pkseutu)
predicted_PS_support <- as.numeric(predict(model, data.frame(SDP = 15.5)))
print(predicted_PS_support) # 14.13686
# => Where the explanatory party's support, i.e. SDP's support, is 15.5%, my model predicts support for PS to be 14.13686%.

# c)
# The exception is Otaniemi, as is demonstrated below:
ggplot(data = pkseutu, aes(x = Muut, y = KOK, label = Alue)) +
  geom_text()

filtered_pkseutu <- pkseutu[pkseutu$Alue != "Otaniemi", ]
model_2 <- lm(KOK ~ Muut, data = filtered_pkseutu)
filtered_pkseutu$KOK_predicted <- predict(model_2)
ggplot(data = filtered_pkseutu, aes(x = Muut, y = KOK)) +
  geom_point() +
  geom_line(aes(y = KOK_predicted), color = "blue")

all_parties <- colnames(filtered_pkseutu)[3:length(colnames(filtered_pkseutu))]
all_parties_except_kok_and_muut <- all_parties[all_parties != "KOK" & all_parties != "Muut" & all_parties != "KOK_predicted"]
max_R_squared <- -Inf
selected_party <- ""
selected_model <- NA
filtered_pkseutu <- pkseutu[pkseutu$Alue != "Otaniemi", ]
for (party in all_parties_except_kok_and_muut) {
  formula <- as.formula(paste("KOK ~ Muut +", party))
  new_model <- lm(formula, data = filtered_pkseutu)
  new_R_squared <- summary(new_model)$r.squared
  if (new_R_squared > max_R_squared) {
    max_R_squared <- new_R_squared
    selected_party <- party
    selected_model <- new_model
  }
}
print(max_R_squared) # 0.7871733
print(selected_party) # SDP
summary(selected_model)

predicted_KOK_support <- as.numeric(predict(selected_model, data.frame(Muut = 3.5, SDP = 10.2)))
predicted_KOK_support # 29.31328

# => The support for KOK is predicted to be 29.31328% when Muut support is 3.5% and SDP support is 10.2% in my model.
# This model is particularly suitable because the coefficient of determination is relatively, but not overfittingly, high, at 0.787.
# This means that roughly 78.7% of the variability in the support for KOK can be explained by the support for Muut and the support for SDP.


# d)
model_3 <- lm(KOK ~ SDP + PS + KESK + VAS + VIHR + RKP + KD + Muut, data = pkseutu)
ggplot(data = pkseutu, aes(x = SDP + PS + KESK + VAS + VIHR + RKP + KD + Muut, y = KOK)) +
  geom_point() +
  geom_smooth(method = "lm")
# I notice that there is a very strong negative linear correlation between the support for KOK and the support all other parties, and the model seems to fit perfectly.
# This is entirely understandable, because when we group all other parties together, there is now only two choices: vote for KOK or vote for any other party.
# Any vote for KOK means one less vote for the other parties, and vice versa.
# Thus, the correlation between the support for KOK and the support for the other parties must be perfectly negative.

# Task 2
finland <- read.csv("ek2023.csv", encoding = "latin1")
hki <- read.csv("hki2019.csv", encoding = "latin1")

# a)
largestSupport <- function(df, party) {
  percentages <- df[[party]]
  if (length(percentages) == 0) {
    return("Please check the party name")
  }
  max_support <- max(percentages, na.rm = TRUE)
  max_index <- which.max(percentages)
  max_area <- df[max_index, "Alue"]
  return(paste(party, "support is highest in the", max_area, "region and its support is", max_support, "percent"))
}
print(largestSupport(hki, "PIR"))
print(largestSupport(finland, "SDP"))
print(largestSupport(hki, "XYZ"))

# b)
largestParty <- function(df, region) {
  row <- subset(df, Alue == region)
  if (length(rownames(row)) == 0) {
    return("The area not found")
  }
  numeric_row <- row[, sapply(row, is.numeric)]
  max_support <- max(numeric_row)
  max_party <- names(numeric_row)[which.max(numeric_row)]
  return(paste("The largest party in the", region, "region is", max_party, "and its support is", max_support, "percent"))
}
print(largestParty(hki, "Katajanokka"))
print(largestParty(finland, "Närpiö"))
print(largestParty(finland, "Exactum"))

# c)
visualize <- function(df, party_1, party_2) {
  ggplot(df) +
    geom_point(mapping = aes(x = .data[[party_1]], y = .data[[party_2]], colour = .data[["Suuralue"]])) +
    labs(x = paste("Support for", party_1, "(%)"), y = paste("Support for", party_2, "(%)"), colour = "Major Area") +
    ggtitle(paste("Support for", party_1, "vs support for", party_2, "in Helsinki's voting areas")) +
    theme_minimal()
}
visualize(hki, "VAS", "KOK")

# The picture shows a negative linear correlation between the support for the Left Alliance and the support for the National Coalition Party. Areas with higher support for the Left Alliance tends to have lower support for the National Coalition Party, and vice-versa.
# For instance, areas in the South (Eteläinen) mostly have higher support for the National Coalition Party (20-40%) but lower support for the Left Alliance (5-10%).
# Conversely, areas in the Central (Keskinen) have higher support for the Left Alliance (15-25%) but lower support for the National Coalition Party (5-15%).

# d)
compare <- function(df, region, reference_region, k) {
  row <- subset(df, Alue == region)
  numeric_row <- as.numeric(row[, sapply(row, is.numeric)])
  parties <- colnames(row[, sapply(row, is.numeric)])
  sorted_support <- sort(numeric_row, decreasing = TRUE)

  selected_parties <- c()

  for (support_value in sorted_support[1:k]) {
    party_index <- which(numeric_row == support_value)
    selected_parties <- c(selected_parties, parties[party_index])
  }

  shortened_row_1 <- row[, selected_parties, drop = FALSE]
  row_2 <- subset(df, Alue == reference_region)
  shortened_row_2 <- row_2[, selected_parties, drop = FALSE]

  combined_df <- data.frame(
    Party = selected_parties,
    Region = rep(c(region, reference_region), each = k),
    Support = c(as.numeric(shortened_row_1), as.numeric(shortened_row_2))
  )

  ggplot(combined_df, aes(x = Party, y = Support, fill = Region)) +
    geom_bar(stat = "identity", position = "dodge") +
    labs(
      title = paste("Support for", k, "largest parties in", region, "vs in", reference_region),
      x = "Party",
      y = "Support (%)"
    ) +
    theme_minimal()
}
compare(finland, "Helsinki", "Oulu", 10)

# Task 3
# a)
density_function_of_prior_distribution <- function(theta) {
  if (theta > 0 && theta <= 1/100) {
    return(200 * theta)
  } else if (theta > 1/100 && theta <= 1) {
    return(20000/9900 * (1 - theta))
  } else {
    return(0)
  }
}

theta_values <- seq(0, 1, length.out = 9999)
prior_values <- sapply(theta_values, density_function_of_prior_distribution)

ggplot(data.frame(theta = theta_values, density = prior_values), aes(x = theta, y = density)) +
  geom_line(color = "blue") +
  labs(title = "Prior Distribution of θ", x = "θ", y = "Density") +
  theme_minimal()

# The prior distribution tells us the following about Mr. Fisher’s prior expectations about θ:
# - θ, that is, the probability of the die landing on 1000, is probably very small, since the density function peaks at a value very close to 0.
# - Although Mr. Fisher suspects that the die is weighted and more likely to land on 1000 than on any other number, he only suspects the die to be slightly weighted and not heavily biased, since after the peak, the density function declines linearly without fail as θ gets larger.
# - If the die were unbiased, θ would be exactly 1/1000. Since the density function peaks when θ is at 1/100, it means Mr. Fisher suspects that θ, i.e., the probability of the die landing on 1000, is most likely to be 10 times more than the probability of the die landing on any other number.

# b)
k <- 1234
n <- 234321

log_of_posterior_values <- sapply(theta_values, function(theta) {
  log_prior <- log(density_function_of_prior_distribution(theta))
  if (log_prior == -Inf) return(-Inf)
  log_likelihood <- k * log(theta) + (n - k) * log(1 - theta)
  return(log_prior + log_likelihood)
})

map_estimate <- theta_values[which.max(log_of_posterior_values)]
print(map_estimate) # 0.00530106

plot(
  theta_values,
  log_of_posterior_values,
  type = "l",
  col = "blue",
  lwd = 4,
  main = "Logarithm of the Posterior Distribution of θ",
  xlab = "θ",
  ylab = "Logarithm of Density"
)
abline(v = map_estimate, col = "red", lty = 2)
legend("topright", legend = paste("MAP Estimate: θ ≈", map_estimate), col = "red", lty = 2)

# Since the maximum a posteriori probability estimate is calculated to be 0.0053,
# that is also what Mr. Fisher thinks is the best estimate for θ.

# If the prior distribution had been a uniform distribution, the estimate would have most likely been different, specifically, higher.
# This is because in our non-uniform example, Mr. Fisher believes that the die is only slightly weighted, i.e., he believes
# that θ most likely has a small value, as we already saw in part a) that the density function peaked at a small value near 0.
# This, in turn, causes the MAP estimate to also be a small value near 0. If the prior distribution had been uniform,
# the density function in part a) would have likely peaked at a larger value, and so the MAP estimate would have also been larger.

# The posterior distribution should be logarithmised because:
# 1. The values in the posterior distribution might be very small, causing underflow errors when performing calculations. Logarithmising these values stabilises them in a range that is generally higher, helping us avoid calculation errors.
# 2. Doing so makes calculating the MAP estimate easier. In order to calculate the MAP, we have to find the value of theta that would
# maximise the value of the density function of the posterior distribution. Doing this often requires differentiating the density function.
# Since the density function might be complex in the posterior distribution, differentiating it is difficult, and so is calculating the MAP estimate.
# Logarithmising the posterior distribution, however, gives us a "smooth" curve, which is much easier to differentiate.


# Task 4
# a)
set.seed(1)
options(scipen = 999)
lotterymachine <- function(n) {
  p = 1/18643
  return(rbinom(1, n, p))
}
number_of_rounds <- 10000
tickets_per_round <- 50
total_tickets <- number_of_rounds * tickets_per_round
results <- replicate(number_of_rounds, lotterymachine(tickets_per_round))

# ML estimate for a one-ticket winning probability
ML_estimate = sum(results)/total_tickets
print(ML_estimate) # 0.000032

ste <- sqrt(ML_estimate * (1 - ML_estimate) / total_tickets)
z <- 1.96
moe <- z * ste
lower <- ML_estimate - moe
upper <- ML_estimate + moe
lower # 0.00001632025
upper # 0.00004767975
print(paste0("Our 95% Confidence Interval is: (", lower,", ", upper, ")"))
# Our 95% Confidence Interval is: (0.0000163202508820071, 0.0000476797491179929)

real_probability <- 1/18643
print(real_probability) # 0.00005363944
print(lower <= real_probability && real_probability <= upper) # FALSE

# => Our 95% Confidence Interval does not include the probability of winning.

# If we increase the number of lottery rounds, our confidence interval should become narrower and thus more accurate.
# This is because adding more lottery rounds makes the number of total tickets, and therefore the standard error, i.e. sqrt(ML_estimate * (1 - ML_estimate) / total_tickets), smaller,
# which in turn makes the margin of error smaller.

# If we increase the number of tickets played in each round, the total number of tickets would increase, making the standard error smaller,
# making the margin of error smaller, and thus also making the confidence interval narrower.

# b)
number_of_simulations <- 100
ML_estimates <- numeric(number_of_simulations)
for (i in 1:number_of_simulations) {
  simulation_results <- replicate(number_of_rounds, lotterymachine(tickets_per_round))
  this_round_ML_estimate = sum(simulation_results)/total_tickets
  ML_estimates[i] <- this_round_ML_estimate
}
sorted_ML_estimates = sort(ML_estimates)
lower_2 <- sorted_ML_estimates[round(0.025 * number_of_simulations)]
upper_2 <- sorted_ML_estimates[round(0.975 * number_of_simulations)]
lower_2 # 0.000032
upper_2 # 0.00007
print(paste0("The smallest interval containing 95% of the ML estimates is: (", lower_2, ", ", upper_2, ")"))
# The smallest interval containing 95% of the ML estimates is: (0.000032, 0.00007)

# How do you interpret the above described matter?
# We can interpret our result to mean that if we continue to perform the simulations in part a) an infinite number of times
# and calculate the ML estimate for each simulation, approximately 95% of those estimates will fall into the range (0.000032, 0.00007)

# What happens if you increase the number of simulations?
# If we increase the number of simulations, our interval gets narrower and more accurate as the margin of error becomes smaller.
# As the number of simulations approaches infinity, the percentage of the estimates falling into the interval will approach 95%.
# This is due to the law of large numbers.

# What if you increase the number of tickets played each round?
# If we increase the number of tickets played each round, our interval also gets narrower and more accurate.
# This is also due to the law of large numbers.